You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

57 lines
1.5 KiB
JavaScript

"use strict";
const Promise = require("bluebird");
const promiseTaskQueue = require("promise-task-queue");
const assureArray = require("assure-array");
const createScrapeLogger = require("./scrape-logger");
const normalizeTags = require("./normalize-items");
module.exports = function simpleRunner(createScraper) {
let scraper = createScraper();
let logger = createScrapeLogger({ name: scraper.name });
let queue = promiseTaskQueue();
queue.define("scrape", (task) => {
let {url} = task;
let updatedUrl = scraper.updateUrl(url);
let matchingHandler = scraper.handlers.find(([regex, _handler]) => regex.exec(updatedUrl));
if (matchingHandler == null) {
logger.error(`Scraper does not have a handler for URL in queue: ${updatedUrl}`);
} else {
return Promise.try(() => {
let [_regex, handler] = matchingHandler;
return handler(updatedUrl);
}).then(({urls, items}) => {
if (urls != null) {
urls.forEach((url) => {
logger.debug(`Queueing new URL: ${url}`);
queue.push("scrape", {url: url});
});
}
if (items != null) {
items.forEach((item) => {
let normalizedItem = normalizeTags(item);
logger.done(normalizedItem);
logger.debug(`Encountered item`, normalizedItem);
});
}
return null;
});
}
}, { interval: 0.5 });
return Promise.try(() => {
return scraper.initialize();
}).then((rootUrls) => {
assureArray(rootUrls).forEach((url) => {
queue.push("scrape", {url: url});
});
});
};