"use strict";
|
|
|
|
const Promise = require("bluebird");
|
|
const promiseWhile = require("promise-while-loop");
|
|
const databaseError = require("database-error");
|
|
const assureArray = require("assure-array");
|
|
|
|
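/* Persistence layer for a URL-scraping queue. `db` is expected to be a
   configured knex instance; `databaseError` (database-error) normalizes
   driver-specific errors into named types such as
   UniqueConstraintViolationError. */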
module.exports = function ({ db }) {
	return {
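		/* Resolves a scraper name to its row ID, creating the row first if it
		   does not exist yet. The insert is made idempotent by swallowing a
		   UniqueConstraintViolationError; the follow-up select then returns
		   the (new or pre-existing) row. */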
		getScraperId: function ({ name }) {
			return Promise.try(() => {
				return db("scrapers")
					.insert({ name: name })
					.catch(databaseError.rethrow)
					.catch({ name: "UniqueConstraintViolationError" }, (err) => null);
			}).then(() => {
				return db("scrapers").first().where({ name: name });
			}).then((result) => {
				return result.id;
			});
		},
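		/* Fetches up to `amount` unscraped URLs for a scraper, deepest-first.
		   Of the fetched rows, only those at the single highest depth are
		   returned, so deeper pages are always drained before shallower
		   ones. */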
		getBatch: function ({ scraperId, amount }) {
			return Promise.try(() => {
				return db("urls")
					.where({
						scraper_id: scraperId,
						scraped: false
					})
					.orderBy([
						{ column: "depth", order: "desc" },
						{ column: "id", order: "asc" }
					])
					.limit(amount);
			}).then((urls) => {
				if (urls.length === 0) {
					return [];
				} else {
					let highestDepth = urls[0].depth;

					return urls.filter((url) => url.depth === highestDepth);
				}
			}).catch(databaseError.rethrow);
		},
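		/* Inserts a depth-0 starting URL for a scraper. */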
		insertRootUrl: function ({ scraperId, url, description, metadata }) {
			return db("urls").insert({
				scraper_id: scraperId,
				url: url,
				description: description,
				metadata: metadata,
				depth: 0,
				scraped_at: new Date()
			}).catch(databaseError.rethrow);
		},
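		/* Returns the next batch of URLs to process; when the queue is empty
		   (e.g. on a first run), the given root URL(s) are inserted first and
		   the batch is fetched again. `rootUrls` may be a single object or an
		   array. */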
		getRootOrBatch: function ({ scraperId, rootUrls, amount }) {
			return Promise.try(() => {
				return this.getBatch({ amount, scraperId });
			}).then((results) => {
				if (results.length > 0) {
					return results;
				} else {
					return Promise.try(() => {
						return Promise.map(assureArray(rootUrls), (rootUrl) => {
							return this.insertRootUrl({
								scraperId: scraperId,
								url: rootUrl.url,
								description: rootUrl.description
							});
						});
					}).then(() => {
						return this.getBatch({ amount, scraperId });
					});
				}
			}).catch(databaseError.rethrow);
		},
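		/* Queues a newly discovered URL. Any URL below the root level must
		   reference the parent URL it was found on, so that its ancestor
		   chain can be walked later by getAncestorsForUrl. Note that the
		   depth check throws synchronously rather than rejecting. */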
		addUrlToQueue: function ({ scraperId, url, description, metadata, parentUrlId, depth }) {
			if (parentUrlId == null && depth > 0) {
				throw new Error("Parent URL ID is required for a URL with a depth higher than zero");
			} else {
				return Promise.try(() => {
					return db("urls").insert({
						scraper_id: scraperId,
						parent_id: parentUrlId,
						depth: depth,
						url: url,
						description: description,
						metadata: metadata,
						scraped_at: new Date()
					});
				}).catch(databaseError.rethrow);
			}
		},
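		/* Walks the parent_id chain upwards from the given URL, one query per
		   step, and resolves with the list of ancestor rows (deduplicated per
		   depth, see below). */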
		getAncestorsForUrl: function ({ urlId }) {
			return Promise.try(() => {
				return promiseWhile((lastResult) => lastResult != null, (lastResult) => {
					return db("urls").where({
						id: (lastResult != null)
							? lastResult.parent_id
							: urlId
					}).first();
				});
			}).then((ancestors) => {
				let depthsSeen = new Set();

				return ancestors
					/* Get rid of the `undefined` at the end of the ancestors
					   list, and the URL we originally passed in at the start. */
					.slice(1, -1)
					/* We filter the list of ancestors such that only the
					   *last-encountered* item for a particular depth ends up
					   in the ancestor list, to prevent unnecessarily
					   re-retrieving many pages when pagination is in use
					   (which results in many ancestors at the same depth).
					   This approach is based on the assumption that each
					   request needed to re-establish a session for a given URL
					   has its own dedicated depth. Since we retrieve the list
					   of ancestors in reverse order, we let through the
					   *first* ancestor at a given depth in the code below. */
					.filter((ancestor) => {
						if (depthsSeen.has(ancestor.depth)) {
							return false;
						} else {
							depthsSeen.add(ancestor.depth);
							return true;
						}
					});
			}).catch(databaseError.rethrow);
		},
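		/* Stores a scraped item for a URL. `isPartial` presumably flags items
		   that are still missing data from other pages; this is an assumption
		   based on the column name. */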
		storeItem: function ({ scraperId, urlId, data, isPartial }) {
			return db("items").insert({
				scraper_id: scraperId,
				url_id: urlId,
				data: data,
				is_partial: isPartial,
				scraped_at: new Date()
			}).catch(databaseError.rethrow);
		},
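		/* Flags a URL as scraped so getBatch will no longer return it. */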
		markUrlDone: function ({ urlId }) {
			return Promise.try(() => {
				return db("urls").update({ scraped: true }).where({ id: urlId });
			}).catch(databaseError.rethrow);
		}
	};
};