"use strict";
|
|
|
|
const Promise = require("bluebird");
|
|
const promiseWhile = require("promise-while-loop");
|
|
const databaseError = require("database-error");
|
|
const assureArray = require("assure-array");
|
|
|
|
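/* Persistence layer for a URL-scraping queue. `db` is expected to be a
   configured knex instance; `databaseError` (database-error) normalizes
   driver-specific errors into named types such as
   UniqueConstraintViolationError. */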
module.exports = function ({ db }) {
	return {
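		/* Resolves a scraper name to its row ID, creating the row first if it
		   does not exist yet. The insert is made idempotent by swallowing a
		   UniqueConstraintViolationError; the follow-up select then returns
		   the (new or pre-existing) row. */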
		getScraperId: function ({ name }) {
			return Promise.try(() => {
				return db("scrapers")
					.insert({ name: name })
					.catch(databaseError.rethrow)
					.catch({ name: "UniqueConstraintViolationError" }, (err) => null);
			}).then(() => {
				return db("scrapers").first().where({ name: name });
			}).then((result) => {
				return result.id;
			});
		},
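		/* Fetches up to `amount` unscraped URLs for a scraper, deepest-first.
		   Of the fetched rows, only those at the single highest depth are
		   returned, so deeper pages are always drained before shallower
		   ones. */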
		getBatch: function ({ scraperId, amount }) {
			return Promise.try(() => {
				return db("urls")
					.where({
						scraper_id: scraperId,
						scraped: false
					})
					.orderBy([
						{ column: "depth", order: "desc" },
						{ column: "id", order: "asc" }
					])
					.limit(amount);
			}).then((urls) => {
				if (urls.length === 0) {
					return [];
				} else {
					let highestDepth = urls[0].depth;

					return urls.filter((url) => url.depth === highestDepth);
				}
			}).catch(databaseError.rethrow);
		},
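		/* Inserts a depth-0 starting URL for a scraper. */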
		insertRootUrl: function ({ scraperId, url, description, metadata }) {
			return db("urls").insert({
				scraper_id: scraperId,
				url: url,
				description: description,
				metadata: metadata,
				depth: 0,
				scraped_at: new Date()
			}).catch(databaseError.rethrow);
		},
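		/* Returns the next batch of URLs to process; when the queue is empty
		   (e.g. on a first run), the given root URL(s) are inserted first and
		   the batch is fetched again. `rootUrls` may be a single object or an
		   array. */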
		getRootOrBatch: function ({ scraperId, rootUrls, amount }) {
			return Promise.try(() => {
				return this.getBatch({ amount, scraperId });
			}).then((results) => {
				if (results.length > 0) {
					return results;
				} else {
					return Promise.try(() => {
						return Promise.map(assureArray(rootUrls), (rootUrl) => {
							return this.insertRootUrl({
								scraperId: scraperId,
								url: rootUrl.url,
								description: rootUrl.description
							});
						});
					}).then(() => {
						return this.getBatch({ amount, scraperId });
					});
				}
			}).catch(databaseError.rethrow);
		},
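		/* Queues a newly discovered URL. Any URL below the root level must
		   reference the parent URL it was found on, so that its ancestor
		   chain can be walked later by getAncestorsForUrl. Note that the
		   depth check throws synchronously rather than rejecting. */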
		addUrlToQueue: function ({ scraperId, url, description, metadata, parentUrlId, depth }) {
			if (parentUrlId == null && depth > 0) {
				throw new Error("Parent URL ID is required for a URL with a depth higher than zero");
			} else {
				return Promise.try(() => {
					return db("urls").insert({
						scraper_id: scraperId,
						parent_id: parentUrlId,
						depth: depth,
						url: url,
						description: description,
						metadata: metadata,
						scraped_at: new Date()
					});
				}).catch(databaseError.rethrow);
			}
		},
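		/* Walks the parent_id chain upwards from the given URL, one query per
		   step, and resolves with the list of ancestor rows (deduplicated per
		   depth, see below). */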
		getAncestorsForUrl: function ({ urlId }) {
			return Promise.try(() => {
				return promiseWhile((lastResult) => lastResult != null, (lastResult) => {
					return db("urls").where({
						id: (lastResult != null)
							? lastResult.parent_id
							: urlId
					}).first();
				});
			}).then((ancestors) => {
				let depthsSeen = new Set();

				return ancestors
					/* Get rid of the `undefined` at the end of the ancestors
					   list, and the URL we originally passed in at the start. */
					.slice(1, -1)
					/* We filter the list of ancestors such that only the
					   *last-encountered* item for a particular depth ends up
					   in the ancestor list, to prevent unnecessarily
					   re-retrieving many pages when pagination is in use
					   (which results in many ancestors at the same depth).
					   This approach is based on the assumption that each
					   request needed to re-establish a session for a given URL
					   has its own dedicated depth. Since we retrieve the list
					   of ancestors in reverse order, we let through the
					   *first* ancestor at a given depth in the code below. */
					.filter((ancestor) => {
						if (depthsSeen.has(ancestor.depth)) {
							return false;
						} else {
							depthsSeen.add(ancestor.depth);
							return true;
						}
					});
			}).catch(databaseError.rethrow);
		},
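		/* Stores a scraped item for a URL. `isPartial` presumably flags items
		   that are still missing data from other pages; this is an assumption
		   based on the column name. */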
		storeItem: function ({ scraperId, urlId, data, isPartial }) {
			return db("items").insert({
				scraper_id: scraperId,
				url_id: urlId,
				data: data,
				is_partial: isPartial,
				scraped_at: new Date()
			}).catch(databaseError.rethrow);
		},
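		/* Flags a URL as scraped so getBatch will no longer return it. */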
		markUrlDone: function ({ urlId }) {
			return Promise.try(() => {
				return db("urls").update({ scraped: true }).where({ id: urlId });
			}).catch(databaseError.rethrow);
		}
	};
};