You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
57 lines
1.5 KiB
JavaScript
57 lines
1.5 KiB
JavaScript
"use strict";
|
|
|
|
const Promise = require("bluebird");
|
|
const promiseTaskQueue = require("promise-task-queue");
|
|
const assureArray = require("assure-array");
|
|
|
|
const createScrapeLogger = require("./scrape-logger");
|
|
const normalizeTags = require("./normalize-items");
|
|
|
|
module.exports = function simpleRunner(createScraper) {
|
|
let scraper = createScraper();
|
|
let logger = createScrapeLogger({ name: scraper.name });
|
|
let queue = promiseTaskQueue();
|
|
|
|
queue.define("scrape", (task) => {
|
|
let {url} = task;
|
|
|
|
let updatedUrl = scraper.updateUrl(url);
|
|
|
|
let matchingHandler = scraper.handlers.find(([regex, _handler]) => regex.exec(updatedUrl));
|
|
|
|
if (matchingHandler == null) {
|
|
logger.error(`Scraper does not have a handler for URL in queue: ${updatedUrl}`);
|
|
} else {
|
|
return Promise.try(() => {
|
|
let [_regex, handler] = matchingHandler;
|
|
|
|
return handler(updatedUrl);
|
|
}).then(({urls, items}) => {
|
|
if (urls != null) {
|
|
urls.forEach((url) => {
|
|
logger.debug(`Queueing new URL: ${url}`);
|
|
queue.push("scrape", {url: url});
|
|
});
|
|
}
|
|
|
|
if (items != null) {
|
|
items.forEach((item) => {
|
|
let normalizedItem = normalizeTags(item);
|
|
logger.done(normalizedItem);
|
|
logger.debug(`Encountered item`, normalizedItem);
|
|
});
|
|
}
|
|
|
|
return null;
|
|
});
|
|
}
|
|
}, { interval: 0.5 });
|
|
|
|
return Promise.try(() => {
|
|
return scraper.initialize();
|
|
}).then((rootUrls) => {
|
|
assureArray(rootUrls).forEach((url) => {
|
|
queue.push("scrape", {url: url});
|
|
});
|
|
});
|
|
}; |