"use strict"; const Promise = require("bluebird"); const promiseWhile = require("promise-while-loop"); const databaseError = require("database-error"); const assureArray = require("assure-array"); module.exports = function ({ db }) { return { getScraperId: function ({ name }) { return Promise.try(() => { return db("scrapers") .insert({ name: name }) .catch(databaseError.rethrow) .catch({ name: "UniqueConstraintViolationError" }, (err) => null); }).then(() => { return db("scrapers").first().where({ name: name }); }).then((result) => { return result.id; }); }, getBatch: function ({ scraperId, amount }) { return Promise.try(() => { return db("urls") .where({ scraper_id: scraperId, scraped: false }) .orderBy([ { column: "depth", order: "desc" }, { column: "id", order: "asc" } ]) .limit(amount); }).then((urls) => { if (urls.length === 0) { return []; } else { let highestDepth = urls[0].depth; return urls.filter((url) => url.depth === highestDepth); } }).catch(databaseError.rethrow); }, insertRootUrl: function ({ scraperId, url, description, metadata }) { return db("urls").insert({ scraper_id: scraperId, url: url, description: description, metadata: metadata, depth: 0, scraped_at: new Date() }).catch(databaseError.rethrow); }, getRootOrBatch: function ({ scraperId, rootUrls, amount }) { return Promise.try(() => { return this.getBatch({ amount, scraperId }); }).then((results) => { if (results.length > 0) { return results; } else { return Promise.try(() => { return Promise.map(assureArray(rootUrls), (rootUrl) => { return this.insertRootUrl({ scraperId: scraperId, url: rootUrl.url, description: rootUrl.description }); }); }).then(() => { return this.getBatch({ amount, scraperId }); }); } }).catch(databaseError.rethrow); }, addUrlToQueue: function ({ scraperId, url, description, metadata, parentUrlId, depth }) { if (parentUrlId == null && depth > 0) { throw new Error("Parent URL ID is required for a URL with a depth higher than zero"); } else { return Promise.try(() => { return db("urls").insert({ scraper_id: scraperId, parent_id: parentUrlId, depth: depth, url: url, description: description, metadata: metadata, scraped_at: new Date() }); }).catch(databaseError.rethrow); } }, getAncestorsForUrl: function ({ urlId }) { return Promise.try(() => { return promiseWhile((lastResult) => lastResult != null, (lastResult) => { return db("urls").where({ id: (lastResult != null) ? lastResult.parent_id : urlId }).first(); }); }).then((ancestors) => { let depthsSeen = new Set(); return ancestors /* Get rid of the `undefined` at the end of the ancestors list, and the URL we originally passed in at the start. */ .slice(1, -1) /* We filter the list of ancestors such that only the *last-encountered* item for a particular depth ends up in the ancestor list, to prevent unnecessarily re-retrieving many pages when pagination is in use (which results in many ancestors at the same depth). This approach is based on the assumption that each request needed to re-establish a session for a given URL has its own dedicated depth. Since we retrieve the list of ancestors in reverse order, we let through the *first* ancestor at a given depth in the code below. 
*/ .filter((ancestor) => { if (depthsSeen.has(ancestor.depth)) { return false; } else { depthsSeen.add(ancestor.depth); return true; } }); }).catch(databaseError.rethrow); }, storeItem: function ({ scraperId, urlId, data, isPartial }) { return db("items").insert({ scraper_id: scraperId, url_id: urlId, data: data, is_partial: isPartial, scraped_at: new Date() }).catch(databaseError.rethrow); }, markUrlDone: function ({ urlId }) { return Promise.try(() => { return db("urls").update({ scraped: true }).where({ id: urlId }); }).catch(databaseError.rethrow); } } };
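
/*
	Usage sketch: the flow below shows one way these queries could fit together in a
	scraper loop. It is only an illustration, not part of this module: the `./knexfile`
	and `./queries` paths, the "example-scraper" name, the example.com root URL, and the
	`scrapePage` function (assumed to resolve to `{ items, childUrls }`) are all
	hypothetical assumptions.

		const Promise = require("bluebird");
		const knex = require("knex")(require("./knexfile"));
		const queries = require("./queries")({ db: knex });

		Promise.try(() => {
			return queries.getScraperId({ name: "example-scraper" });
		}).then((scraperId) => {
			return Promise.try(() => {
				// Seed the queue with a root URL if it is empty, otherwise get the next batch.
				return queries.getRootOrBatch({
					scraperId: scraperId,
					rootUrls: [{ url: "https://example.com/", description: "Start page" }],
					amount: 20
				});
			}).map((queuedUrl) => {
				return Promise.try(() => {
					return scrapePage(queuedUrl); // hypothetical scraping logic
				}).then(({ items, childUrls }) => {
					return Promise.try(() => {
						// Store everything that was scraped from this page...
						return Promise.map(items, (item) => {
							return queries.storeItem({
								scraperId: scraperId,
								urlId: queuedUrl.id,
								data: item,
								isPartial: false
							});
						});
					}).then(() => {
						// ... queue any newly-discovered URLs one level deeper ...
						return Promise.map(childUrls, (childUrl) => {
							return queries.addUrlToQueue({
								scraperId: scraperId,
								parentUrlId: queuedUrl.id,
								depth: queuedUrl.depth + 1,
								url: childUrl.url,
								description: childUrl.description
							});
						});
					}).then(() => {
						// ... and only then mark the page as done.
						return queries.markUrlDone({ urlId: queuedUrl.id });
					});
				});
			});
		});
*/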