"use strict";
const Promise = require("bluebird");
const promiseWhile = require("promise-while-loop");
const databaseError = require("database-error");
const assureArray = require("assure-array");
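/* Database access layer for the scraping queue. `db` is assumed to be a Knex-style query builder
instance (the methods below rely on its `insert`/`where`/`orderBy` chaining API); database errors are
passed through `database-error`'s rethrow handler so that they surface as named error types such as
UniqueConstraintViolationError. */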
module.exports = function ({ db }) {
	return {
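		/* Looks up the numeric ID for a scraper by name, creating the row first if it does not exist
		yet. The insert is allowed to fail with a UniqueConstraintViolationError (presumably backed by
		a unique constraint on `name`), which is treated as "already exists" and ignored. */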
		getScraperId: function ({ name }) {
			return Promise.try(() => {
				return db("scrapers")
					.insert({ name: name })
					.catch(databaseError.rethrow)
					.catch({ name: "UniqueConstraintViolationError" }, (err) => null);
			}).then(() => {
				return db("scrapers").first().where({ name: name });
			}).then((result) => {
				return result.id;
			});
		},
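		/* Fetches up to `amount` unscraped URLs for the given scraper, deepest-first, and then narrows
		the batch down to only those URLs that share the highest depth in the result set, so that a
		single batch never mixes depths. */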
		getBatch: function ({ scraperId, amount }) {
			return Promise.try(() => {
				return db("urls")
					.where({
						scraper_id: scraperId,
						scraped: false
					})
					.orderBy([
						{ column: "depth", order: "desc" },
						{ column: "id", order: "asc" }
					])
					.limit(amount);
			}).then((urls) => {
				if (urls.length === 0) {
					return [];
				} else {
					let highestDepth = urls[0].depth;
					return urls.filter((url) => url.depth === highestDepth);
				}
			}).catch(databaseError.rethrow);
		},
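		/* Inserts a depth-0 "root" URL for a scraper. Note that `scraped_at` is set at insertion time;
		the `scraped` flag itself is only set later, by `markUrlDone`. */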
		insertRootUrl: function ({ scraperId, url, description, metadata }) {
			return db("urls").insert({
				scraper_id: scraperId,
				url: url,
				description: description,
				metadata: metadata,
				depth: 0,
				scraped_at: new Date()
			}).catch(databaseError.rethrow);
		},
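		/* Returns the next batch of URLs to scrape; if the queue is empty, the configured root URL(s)
		are inserted first and a batch is fetched again. `rootUrls` may be a single object or an array,
		courtesy of `assure-array`. */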
		getRootOrBatch: function ({ scraperId, rootUrls, amount }) {
			return Promise.try(() => {
				return this.getBatch({ amount, scraperId });
			}).then((results) => {
				if (results.length > 0) {
					return results;
				} else {
					return Promise.try(() => {
						return Promise.map(assureArray(rootUrls), (rootUrl) => {
							return this.insertRootUrl({
								scraperId: scraperId,
								url: rootUrl.url,
								description: rootUrl.description
							});
						});
					}).then(() => {
						return this.getBatch({ amount, scraperId });
					});
				}
			}).catch(databaseError.rethrow);
		},
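		/* Queues a newly discovered URL. Any URL at a depth greater than zero must reference the URL
		it was discovered on via `parentUrlId`, since `getAncestorsForUrl` walks that chain back up. */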
		addUrlToQueue: function ({ scraperId, url, description, metadata, parentUrlId, depth }) {
			if (parentUrlId == null && depth > 0) {
				throw new Error("Parent URL ID is required for a URL with a depth higher than zero");
			} else {
				return Promise.try(() => {
					return db("urls").insert({
						scraper_id: scraperId,
						parent_id: parentUrlId,
						depth: depth,
						url: url,
						description: description,
						metadata: metadata,
						scraped_at: new Date()
					});
				}).catch(databaseError.rethrow);
			}
		},
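		/* Walks the `parent_id` chain upwards from the given URL, one query per step, collecting every
		visited row via `promise-while-loop`; the resulting list is then trimmed and de-duplicated per
		depth, as explained in the inline comments below. */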
		getAncestorsForUrl: function ({ urlId }) {
			return Promise.try(() => {
				return promiseWhile((lastResult) => lastResult != null, (lastResult) => {
					return db("urls").where({
						id: (lastResult != null)
							? lastResult.parent_id
							: urlId
					}).first();
				});
			}).then((ancestors) => {
				let depthsSeen = new Set();
				return ancestors
					/* Get rid of the `undefined` at the end of the ancestors list, and the URL we
					originally passed in at the start. */
					.slice(1, -1)
					/* We filter the list of ancestors such that only the *last-encountered* item for a
					particular depth ends up in the ancestor list, to prevent unnecessarily re-retrieving
					many pages when pagination is in use (which results in many ancestors at the same
					depth). This approach is based on the assumption that each request needed to
					re-establish a session for a given URL has its own dedicated depth. Since we retrieve
					the list of ancestors in reverse order, we let through the *first* ancestor at a given
					depth in the code below. */
					.filter((ancestor) => {
						if (depthsSeen.has(ancestor.depth)) {
							return false;
						} else {
							depthsSeen.add(ancestor.depth);
							return true;
						}
					});
			}).catch(databaseError.rethrow);
		},
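		/* Stores a scraped item, linked to both the scraper and the URL it was scraped from. `isPartial`
		presumably flags incomplete results that are to be supplemented by a later scrape. */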
		storeItem: function ({ scraperId, urlId, data, isPartial }) {
			return db("items").insert({
				scraper_id: scraperId,
				url_id: urlId,
				data: data,
				is_partial: isPartial,
				scraped_at: new Date()
			}).catch(databaseError.rethrow);
		},
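		/* Flags a queued URL as scraped, so that it no longer shows up in future `getBatch` calls. */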
		markUrlDone: function ({ urlId }) {
			return Promise.try(() => {
				return db("urls").update({ scraped: true }).where({ id: urlId });
			}).catch(databaseError.rethrow);
		}
	};
};