You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

105 lines
3.6 KiB
JavaScript

'use strict';
const Promise = require("bluebird");
const bhttp = require("bhttp");
const promiseTaskQueue = require("promise-task-queue");
const createEventEmitter = require("create-event-emitter");
const defaultValue = require("default-value");
const debug = require("debug")("pastebinStream:scrapers:pastebinCom");
const promiseSetInterval = require("../promise-set-interval");
const errors = require("../errors");
/**
 * Parses a response body as JSON.
 *
 * Anything that fails to parse is reported as an `HttpError` of type
 * `rateLimited`, with the raw body included in the message.
 *
 * @param {*} body - The response body; handed to `JSON.parse` as-is.
 * @returns {*} The parsed JSON value.
 * @throws {errors.HttpError} When the body is not valid JSON.
 */
function tryParseBody(body) {
	let parsedBody;

	try {
		parsedBody = JSON.parse(body);
	} catch (parseError) {
		/* The original parse error is deliberately replaced; the raw body is the useful part here. */
		throw new errors.HttpError(`Got rate-limited? Error message: ${body}`, {type: "rateLimited"});
	}

	return parsedBody;
}
module.exports = function createPastebinComScraper(options = {}) {
let queue = promiseTaskQueue();
let knownPastes = [];
let previousKnownPastes = [];
if (options.pasteInterval == null) {
throw new Error("A `pasteInterval` is required in the `pastebinCom` scraper configuration, but none was provided");
}
debug(`Initializing with pasteInterval ${options.pasteInterval}, listInterval ${options.listInterval}, and listLimit ${options.listLimit}`);
queue.define("fetchPaste", (task) => {
return Promise.try(() => {
debug(`Fetching paste ID ${task.pasteKey}...`)
return bhttp.get(`https://scrape.pastebin.com/api_scrape_item.php?i=${task.pasteKey}`);
}).then((response) => {
if (response.statusCode !== 200) {
if (response.statusCode >= 500 && response.statusCode < 600) {
// FIXME: The below logic could lead to an infinite recursion if Pastebin never comes back up
return queue.push("fetchPaste", task);
} else {
throw new errors.HttpError(`Encountered a non-200 status code for Pastebin.com while retrieving a raw paste: ${response.statusCode}`, {type: "statusCode"});
}
} else if (response.body.toString().indexOf("Please slow down, you are hitting our servers unnecessarily hard!") === 0) {
throw new errors.HttpError("Got rate-limited!", {type: "rateLimited"});
} else {
return response.body.toString();
}
})
}, {
interval: options.pasteInterval
});
let loop;
return createEventEmitter({
stop: function stopScraper() {
if (loop != null) {
this.emit("stopped");
loop.cancel();
}
},
start: function startScraper() {
loop = promiseSetInterval(() => {
return Promise.try(() => {
return bhttp.get(`https://scrape.pastebin.com/api_scraping.php?limit=${defaultValue(options.listLimit, 100)}`, {
noDecode: true /* Because Pastebin.com errors aren't JSON... */
});
}).then((response) => {
if (response.statusCode !== 200) {
throw new errors.HttpError(`Encountered a non-200 status code for Pastebin.com while listing the most recent pastes: ${response.statusCode}`, {type: "statusCode"});
} else {
return tryParseBody(response.body).reverse();
}
}).tap((pastes) => {
previousKnownPastes = knownPastes;
knownPastes = pastes.map(paste => paste.key);
}).filter((paste) => {
return (!previousKnownPastes.includes(paste.key));
}).tap((pastes) => {
debug(`Found ${pastes.length} new pastes`);
}).each((paste) => {
/* We *intentionally* do not return the Promise chain below; we don't want to block the interval with queue items. */
Promise.try(() => {
return queue.push("fetchPaste", {
pasteKey: paste.key
});
}).then((rawPaste) => {
this.emit("paste", Object.assign({
raw: rawPaste
}, paste));
}).catch((err) => {
this.emit("error", err);
});
}).catch((err) => {
/* This is where eg. rate-limiting errors will end up. */
this.emit("error", err);
});
}, defaultValue(options.listInterval, 60) * 1000, {
startImmediately: true
});
}
});
};