master
Sven Slootweg 5 years ago
commit c8b6870a48

.gitignore

@@ -0,0 +1,2 @@
node_modules
config.json

index.js
@@ -0,0 +1,11 @@
"use strict";
const knex = require("knex");
// const simpleRunner = require("./lib/simple-runner");
const dbRunner = require("./lib/db-runner");
const createLidlServiceScraper = require("./lib/scrapers/lidl-service");
let db = knex(require("./knexfile"));
dbRunner(createLidlServiceScraper, { db: db });

knexfile.js
@@ -0,0 +1,20 @@
"use strict";
const url = require("url");
const config = require("./config.json");
module.exports = {
client: "pg",
// debug: true,
connection: {
connectionString: url.format({
protocol: "socket",
slashes: false,
pathname: config.database.socket,
query: {
db: config.database.database
}
})
}
};
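The knexfile reads `config.database.socket` and `config.database.database` from the gitignored `config.json`. A minimal sketch of what that file presumably looks like; the socket path and database name below are made-up placeholders:

{
	"database": {
		"socket": "/var/run/postgresql",
		"database": "manuals_r_us"
	}
}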

lib/db-runner/db.js
@@ -0,0 +1,133 @@
"use strict";
const Promise = require("bluebird");
const promiseWhile = require("promise-while-loop");
const databaseError = require("database-error");
const assureArray = require("assure-array");
module.exports = function ({ db }) {
return {
getScraperId: function ({ name }) {
return Promise.try(() => {
return db("scrapers")
.insert({ name: name })
.catch(databaseError.rethrow)
.catch({ name: "UniqueConstraintViolationError" }, (err) => null);
}).then(() => {
return db("scrapers").first().where({ name: name });
}).then((result) => {
return result.id;
});
},
getBatch: function ({ scraperId, amount }) {
return Promise.try(() => {
return db("urls")
.where({
scraper_id: scraperId,
scraped: false
})
.orderBy([
{ column: "depth", order: "desc" },
{ column: "id", order: "asc" }
])
.limit(amount);
}).then((urls) => {
if (urls.length === 0) {
return [];
} else {
let highestDepth = urls[0].depth;
return urls.filter((url) => url.depth === highestDepth);
}
}).catch(databaseError.rethrow);
},
insertRootUrl: function ({ scraperId, url, description, metadata }) {
return db("urls").insert({
scraper_id: scraperId,
url: url,
description: description,
metadata: metadata,
depth: 0,
scraped_at: new Date()
}).catch(databaseError.rethrow);
},
getRootOrBatch: function ({ scraperId, rootUrls, amount }) {
return Promise.try(() => {
return this.getBatch({ amount, scraperId });
}).then((results) => {
if (results.length > 0) {
return results;
} else {
return Promise.try(() => {
return Promise.map(assureArray(rootUrls), (rootUrl) => {
return this.insertRootUrl({
scraperId: scraperId,
url: rootUrl.url,
description: rootUrl.description
});
});
}).then(() => {
return this.getBatch({ amount, scraperId });
});
}
}).catch(databaseError.rethrow);
},
addUrlToQueue: function ({ scraperId, url, description, metadata, parentUrlId, depth }) {
if (parentUrlId == null && depth > 0) {
throw new Error("Parent URL ID is required for a URL with a depth higher than zero");
} else {
return Promise.try(() => {
return db("urls").insert({
scraper_id: scraperId,
parent_id: parentUrlId,
depth: depth,
url: url,
description: description,
metadata: metadata,
scraped_at: new Date()
});
}).catch(databaseError.rethrow);
}
},
getAncestorsForUrl: function ({ urlId }) {
return Promise.try(() => {
return promiseWhile((lastResult) => lastResult != null, (lastResult) => {
return db("urls").where({
id: (lastResult != null)
? lastResult.parent_id
: urlId
}).first();
});
}).then((ancestors) => {
let depthsSeen = new Set();
return ancestors
/* Get rid of the `undefined` at the end of the ancestors list, and the URL we originally passed in at the start. */
.slice(1, -1)
/* We filter the list of ancestors such that only the *last-encountered* item for a particular depth ends up in the ancestor list, to prevent unnecessarily re-retrieving many pages when pagination is in use (which results in many ancestors at the same depth). This approach is based on the assumption that each request needed to re-establish a session for a given URL has its own dedicated depth. Since we retrieve the list of ancestors in reverse order, we let through the *first* ancestor at a given depth in the code below. */
.filter((ancestor) => {
if (depthsSeen.has(ancestor.depth)) {
return false;
} else {
depthsSeen.add(ancestor.depth);
return true;
}
});
}).catch(databaseError.rethrow);
},
storeItem: function ({ scraperId, urlId, data, isPartial }) {
return db("items").insert({
scraper_id: scraperId,
url_id: urlId,
data: data,
is_partial: isPartial,
scraped_at: new Date()
}).catch(databaseError.rethrow);
},
markUrlDone: function ({ urlId }) {
return Promise.try(() => {
return db("urls").update({ scraped: true }).where({ id: urlId });
}).catch(databaseError.rethrow);
}
}
};
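To illustrate the depth-deduplication step in `getAncestorsForUrl`: given an ancestor chain in which pagination produced several URLs at the same depth, only the first row encountered per depth survives the filter. A standalone sketch of that logic, with made-up rows:

"use strict";

/* Hypothetical ancestor rows, deepest first, with two pagination pages sharing depth 2. */
let ancestors = [
	{ id: 40, depth: 3, url: "https://example.com/product?page=4" },
	{ id: 31, depth: 2, url: "https://example.com/category?page=3" },
	{ id: 30, depth: 2, url: "https://example.com/category?page=2" },
	{ id: 20, depth: 1, url: "https://example.com/category-index" },
	{ id: 10, depth: 0, url: "https://example.com/" }
];

let depthsSeen = new Set();

let filtered = ancestors.filter((ancestor) => {
	if (depthsSeen.has(ancestor.depth)) {
		return false;
	} else {
		depthsSeen.add(ancestor.depth);
		return true;
	}
});

/* Only ids 40, 31, 20 and 10 remain; the extra depth-2 page is dropped,
   so session recovery does not have to re-fetch every pagination page. */
console.log(filtered.map((ancestor) => ancestor.id));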

lib/db-runner/disposable-queue.js
@@ -0,0 +1,26 @@
"use strict";
const Promise = require("bluebird");
module.exports = function runDisposableQueue(items, processingCallback) {
let itemIndex = 0; // FIXME: overflow
function processItem() {
let disposeFlag = false;
return Promise.try(() => {
let item = items[itemIndex];
itemIndex += 1;
return processingCallback(item, () => {
disposeFlag = true;
});
}).then(() => {
if (disposeFlag === false && itemIndex < items.length) {
return processItem();
}
});
}
return processItem();
};
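A small usage sketch of `runDisposableQueue`, assuming this file lives at `lib/db-runner/disposable-queue.js`: the processing callback receives a dispose function as its second argument, and calling it stops the queue after the current item. This is how the runner below throws away a stale batch.

"use strict";

const Promise = require("bluebird");
const runDisposableQueue = require("./lib/db-runner/disposable-queue");

runDisposableQueue([ 1, 2, 3, 4 ], (item, dispose) => {
	return Promise.try(() => {
		console.log("processing item", item);

		if (item === 2) {
			/* Dispose of the rest of the queue; items 3 and 4 are never processed. */
			dispose();
		}
	});
}).then(() => {
	console.log("queue finished or disposed");
});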

lib/db-runner/index.js
@@ -0,0 +1,180 @@
"use strict";
const Promise = require("bluebird");
const defaultValue = require("default-value");
const createScrapeLogger = require("../scrape-logger");
const normalizeItems = require("../normalize-items");
const normalizeUrls = require("../normalize-urls");
const createDbApi = require("./db");
const runDisposableQueue = require("./disposable-queue");
const errors = require("../errors");
const itemHasPartialData = require("../item-has-partial-data");
const findScraperHandler = require("../find-scraper-handler");
module.exports = function createDbRunner(createScraper, { db }) {
let scraper = createScraper();
let logger = createScrapeLogger({ name: scraper.name });
let dbApi = createDbApi({db});
return Promise.try(() => {
logger.log("Initializing database...");
/* MARKER: Switch to dbApi */
return dbApi.getScraperId({ name: scraper.name });
}).then((scraperId) => {
function restoreState(url) {
/* NOTE: This will re-fetch every ancestor of the specified URL, from the least-depth to the most-depth ancestor. This will gradually reconstruct any (session) state needed to correctly fetch the URL in question, and continue processing the queue. This only needs to be done for the very first queued URL after crash recovery, since from that point on the state will be identical to before the crash. */
return Promise.try(() => {
return dbApi.getAncestorsForUrl({ urlId: url.id });
}).then((ancestors) => {
ancestors.reverse();
return Promise.each(ancestors, (ancestorUrl) => {
/* NOTE: We ignore the returned values here. */
logger.debug(`Loading URL for session recovery: ${ancestorUrl.url}`);
return runItem(ancestorUrl);
});
});
}
function runItem(parentUrl) {
return Promise.try(() => {
let updatedUrl = scraper.updateUrl(parentUrl.url);
let handler = findScraperHandler(scraper, updatedUrl);
return Promise.try(() => {
// logger.debug(`Scraping: ${updatedUrl}`);
if (parentUrl.description != null) {
logger.info(`Scraping: ${parentUrl.description}`);
logger.debug(`URL: ${updatedUrl}`);
} else {
logger.info(`Scraping: ${updatedUrl}`);
}
return handler(updatedUrl, {
description: parentUrl.description,
metadata: defaultValue(parentUrl.metadata, {})
});
}).then((result) => {
let normalizedUrls = normalizeUrls(result.urls, { parentUrl: parentUrl });
let normalizedItems = normalizeItems(result.items);
return {
urls: normalizedUrls,
items: normalizedItems
};
});
});
}
function runBatch(batch) {
return Promise.try(() => {
return runDisposableQueue(batch, (parentUrl, disposeQueue) => {
return Promise.try(() => {
return runItem(parentUrl);
}).then(({urls, items}) => {
if (urls.some((url) => url.depth > parentUrl.depth)) {
/* Since we always want to scrape depth-first, we always want our processing queue to only contain the highest-depth items that exist in the database at that time. Therefore, if we encounter any new to-be-queued URLs that are going to have a higher depth than the current item, we immediately throw away the entirety of the queue, triggering a re-fetch from the database for a new batch at the new maximum depth. */
logger.debug(`Disposing URL queue`);
disposeQueue();
}
return Promise.all([
Promise.map(urls, (url) => {
if (url.description != null) {
logger.debug(`Queueing new URL: ${url.description} (${url.url})`);
} else {
logger.debug(`Queueing new URL: ${url.url}`);
}
return dbApi.addUrlToQueue({
scraperId: scraperId,
parentUrlId: parentUrl.id,
depth: url.depth,
url: url.url,
description: url.description,
metadata: url.metadata
});
}),
Promise.map(items, (item) => {
logger.done(item);
logger.debug(`Encountered item`, item);
return dbApi.storeItem({
scraperId: scraperId,
urlId: parentUrl.id,
data: item,
isPartial: itemHasPartialData(item)
});
})
]);
}).then(() => {
return dbApi.markUrlDone({ urlId: parentUrl.id });
}).catch(errors.NoHandler, (err) => {
logger.error(err.message);
});
});
}).then(() => {
return dbApi.getBatch({
scraperId: scraperId,
amount: 20
});
}).then((nextBatch) => {
if (nextBatch.length > 0) {
return runBatch(nextBatch);
}
});
}
return Promise.try(() => {
return scraper.initialize();
}).then((rootUrls) => {
return dbApi.getRootOrBatch({
scraperId: scraperId,
rootUrls: normalizeUrls(rootUrls),
amount: 20
});
}).then((firstBatch) => {
if (firstBatch.length > 0) {
return Promise.try(() => {
if (firstBatch[0].depth > 0) {
/* We only need to do this for URLs other than the root URLs. */
return restoreState(firstBatch[0]);
}
}).then(() => {
return runBatch(firstBatch);
});
} else {
throw new Error("No URLs in queue after scraper initialization");
}
});
// return Promise.try(() => {
// return Promise.all([
// getBatch(20),
// scraper.initialize()
// ]);
// }).then(([batch, rootUrls]) => {
// if (batch.length === 0) {
// return Promise.try(() => {
// return Promise.map(assureArray(rootUrls), (rootUrl) => {
// return db("urls").insert({
// url: rootUrl,
// depth: null,
// scraper_id: scraperId
// });
// });
// }).then(() => {
// return getBatch(20);
// });
// } else {
// /* NOTE: We ignore the root URLs if there are already unscraped URLs in the database. */
// return batch;
// }
// }).then((batch) => {
// /* MARKER: Implement runBatch, recursively get and process a new batch */
// return runBatch(batch);
// });
});
};
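For reference, a minimal sketch of the scraper interface this runner expects, inferred from how `scraper` is used above and in the Lidl scraper further down: a `name`, an `initialize` that resolves to the root URL(s), a list of `[regex, type, handler]` entries where each handler resolves to `{ urls, items }`, and an `updateUrl` hook. All names and URLs here are made up for illustration.

"use strict";

const Promise = require("bluebird");

module.exports = function createExampleScraper() {
	return {
		/* Used for the `scrapers` table and as the log prefix. */
		name: "Example.com",
		initialize: function () {
			/* Resolves to one or more root URLs that seed the queue at depth 0. */
			return Promise.resolve("https://example.com/index");
		},
		/* Each entry is a `[regex, type, handler]` triple; the first regex that
		   matches the (updated) URL decides which handler runs. */
		handlers: [
			[/\/index$/, "index", (targetUrl, { description, metadata }) => {
				return Promise.try(() => {
					/* A real handler would fetch and parse `targetUrl` here, e.g. with bhttp and cheerio. */
					return {
						/* New URLs to queue; these end up one depth level deeper unless `sameDepth` is set. */
						urls: [{ url: "https://example.com/item/1", description: "Item 1" }],
						/* Scraped items to store in the `items` table. */
						items: [{ title: "Item 1" }]
					};
				});
			}]
		],
		updateUrl: function (oldUrl) {
			/* Hook for rewriting stale parts of queued URLs (e.g. session IDs) before fetching. */
			return oldUrl;
		}
	};
};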

lib/errors.js
@@ -0,0 +1,7 @@
"use strict";
const createError = require("create-error");
module.exports = {
NoHandler: createError("NoHandler")
};

lib/find-scraper-handler.js
@@ -0,0 +1,13 @@
"use strict";
const errors = require("./errors");
module.exports = function findScraperHandler(scraper, targetUrl) {
let result = scraper.handlers.find(([regex]) => regex.exec(targetUrl));
if (result != null) {
let [_regex, _type, handler] = result;
return handler;
} else {
throw new errors.NoHandler(`Scraper does not have a handler for URL in queue: ${targetUrl}`);
}
};

lib/item-has-partial-data.js
@@ -0,0 +1,8 @@
"use strict";
module.exports = function itemHasPartialData(item) {
return Object.keys(item.tags).some((property) => {
let propertyTags = item.tags[property];
return propertyTags.includes("partialData");
});
};

lib/merge-url.js
@@ -0,0 +1,15 @@
"use strict";
const url = require("url");
module.exports = function mergeUrl(baseUrl, newProps) {
let parsedBaseUrl = url.parse(baseUrl);
let fullNewProps = Object.assign({}, newProps);
if (fullNewProps.query != null) {
/* Clear the original `search` string, so that `url.format` uses the new `query` object instead. */
fullNewProps.search = null;
}
return url.format(Object.assign({}, parsedBaseUrl, fullNewProps));
};
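A quick illustration of how `mergeUrl` is used by the Lidl scraper below to build category search and pagination URLs: passing a `query` object replaces the query string of the base URL. The require path assumes the file lives at `lib/merge-url.js`, and the URL is made up.

"use strict";

const mergeUrl = require("./lib/merge-url");

let nextPage = mergeUrl("https://www.example.com/search?page=1&searchText=tools", {
	query: {
		searchText: "tools",
		page: "2"
	}
});

/* "https://www.example.com/search?searchText=tools&page=2" */
console.log(nextPage);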

lib/normalize-items.js
@@ -0,0 +1,41 @@
"use strict";
const assureArray = require("assure-array");
function normalizeTags(item) {
let tags = { _item: [] };
let data = {};
let canonicalItem = item;
while (canonicalItem.__tag != null) {
tags._item.push(canonicalItem.__tag);
canonicalItem = canonicalItem.__contents;
}
for (let property of Object.keys(canonicalItem)) {
let value = canonicalItem[property];
tags[property] = [];
while (value != null && value.__tag != null) {
tags[property].push(value.__tag);
value = value.__contents;
}
data[property] = value;
}
return {
tags: tags,
data: data
};
}
module.exports = function normalizeItems(items) {
if (items == null) {
return [];
} else {
return assureArray(items).map((item) => normalizeTags(item));
}
};

lib/normalize-urls.js
@@ -0,0 +1,31 @@
"use strict";
const assureArray = require("assure-array");
function objectifyUrl(url) {
if (typeof url === "string") {
return { url: url };
} else {
return url;
}
}
module.exports = function normalizeUrls(urls, {parentUrl} = {}) {
if (urls == null) {
return [];
} else {
return assureArray(urls).map((url) => {
let urlObject = objectifyUrl(url);
if (parentUrl != null) {
urlObject.depth = (urlObject.sameDepth)
? parentUrl.depth
: parentUrl.depth + 1;
urlObject.parentUrlId = parentUrl.id;
}
return urlObject;
});
}
};
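A short sketch of what `normalizeUrls` does when a parent URL is supplied (paths and values made up): plain strings are objectified, depth defaults to one deeper than the parent, and `sameDepth: true` keeps a URL at the parent's depth, which is what the pagination links in the Lidl scraper rely on.

"use strict";

const normalizeUrls = require("./lib/normalize-urls");

let parentUrl = { id: 42, depth: 1 };

let normalized = normalizeUrls([
	"https://example.com/product/1",
	{ url: "https://example.com/category?page=2", sameDepth: true }
], { parentUrl: parentUrl });

/* [ { url: ".../product/1", depth: 2, parentUrlId: 42 },
     { url: ".../category?page=2", sameDepth: true, depth: 1, parentUrlId: 42 } ] */
console.log(normalized);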

lib/partial-data.js
@@ -0,0 +1,8 @@
"use strict";
module.exports = function markPartialData(data) {
return {
__tag: "partialData",
__contents: data
};
};
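How the tagging pieces fit together, as a sketch (module paths assumed to be under `lib/`, item values made up): a scraper wraps an incomplete value in `partialData`, `normalizeItems` lifts the tag into `item.tags`, and `itemHasPartialData` then reports the item as partial so it gets stored with `is_partial = true`.

"use strict";

const partialData = require("./lib/partial-data");
const normalizeItems = require("./lib/normalize-items");
const itemHasPartialData = require("./lib/item-has-partial-data");

/* A made-up scraped item where only a truncated description was available. */
let rawItem = {
	title: "Cordless Drill",
	description: partialData("Powerful 18V drill with...")
};

let [normalized] = normalizeItems(rawItem);

/* { _item: [], title: [], description: ["partialData"] } */
console.log(normalized.tags);

/* { title: "Cordless Drill", description: "Powerful 18V drill with..." } */
console.log(normalized.data);

/* true */
console.log(itemHasPartialData(normalized));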

lib/scrape-logger.js
@@ -0,0 +1,81 @@
"use strict";
const chalk = require("chalk");
const util = require("util");
const itemHasPartialData = require("./item-has-partial-data");
function concatenateArguments(args) {
let stringified = args.map((arg) => {
if (typeof arg === "string") {
return arg;
} else {
return util.inspect(arg, {depth: null, colors: false});
}
});
return stringified.join(" ");
}
module.exports = function createScrapeLogger({name}) {
return {
log: function log(message) {
console.log(`${chalk.bold.gray(`[${name}]`)} ${message}`);
},
warning: function logWarning(...args) {
let message = concatenateArguments(args);
this.log(`${chalk.bold.yellow("[⚠ WARNING]")} ${message}`);
},
error: function logError(...args) {
let message = concatenateArguments(args);
this.log(`${chalk.bold.red("[✖ ERROR ]")} ${message}`);
},
info: function logInfo(...args) {
let message = concatenateArguments(args);
this.log(`${chalk.bold.cyan("[* info ]")} ${message}`);
},
done: function done(item) {
let {data} = item;
let title, sku, partial;
if (data.brand != null && data.model != null) {
title = `${data.brand} ${data.model}`;
} else if (data.brand != null) {
title = `${data.brand} ${data.title}`;
} else {
title = data.title;
}
if (data.sku != null) {
sku = `(SKU: ${data.sku}) `;
} else {
sku = "";
}
let containsPartialData = itemHasPartialData(item);
if (containsPartialData) {
partial = chalk.cyan("[partial] ");
} else {
partial = "";
}
let message = `${sku}${partial}${title}`;
this.log(`${chalk.bold.green("[✔ done ]")} ${message}`);
if (data.downloads != null) {
data.downloads.forEach((download) => {
this.log(`${chalk.bold.green(" |")} (${download.language}) ${download.description} :: ${download.url}`);
});
}
},
debug: function debug(...args) {
if (process.env.SCRAPER_DEBUG === "1") {
let message = concatenateArguments(args);
this.log(chalk.gray(message));
}
}
};
};
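A usage sketch for the logger; the item passed to `done` here is made up, but shaped like the output of `normalize-items`. Note that `debug` output only appears when the `SCRAPER_DEBUG` environment variable is set to `1`.

"use strict";

const createScrapeLogger = require("./lib/scrape-logger");

let logger = createScrapeLogger({ name: "Example.com" });

logger.info("Scraping: some category page");
logger.warning("Retrying after a timeout");

/* Only printed when run with SCRAPER_DEBUG=1 */
logger.debug("Queueing new URL:", "https://example.com/item/1");

/* `done` expects a normalized item with `data` and `tags`. */
logger.done({
	data: {
		brand: "Parkside",
		model: "PABS 20-Li",
		sku: "123456",
		downloads: [
			{ language: "de", description: "Bedienungsanleitung", url: "https://example.com/manual.pdf" }
		]
	},
	tags: { _item: [], brand: [], model: [], sku: [], downloads: [] }
});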

lib/scrapers/lidl-service.js
@@ -0,0 +1,230 @@
"use strict";
const Promise = require("bluebird");
const bhttp = require("bhttp");
const url = require("url");
const cheerio = require("cheerio");
const defaultValue = require("default-value");
const mergeUrl = require("../merge-url");
const partialData = require("../partial-data");
function ensureValidResponse(response) {
if (response.statusCode !== 200) {
throw new Error(`Encountered non-200 response`);
}
}
let sessionIdRegex = /\/SID-[0-9A-F]{8}-[0-9A-F]{8}\//;
function resolveFromRoot(path) {
return url.resolve("https://www.lidl-service.com/", path);
}
function parseSku(text) {
return text.match(/IAN: ([0-9]+)/)[1];
}
function parseType(text) {
let trimmedText = trimText(text);
if (trimmedText === "Bedienungsanleitung") {
return "manual";
} else if (trimmedText === "Treiber") {
return "driver";
} else if (trimmedText === "Sonstiges") {
/* "Miscellaneous" */
return null;
} else {
throw new Error(`Unrecognized download type: ${trimmedText}`);
}
}
function trimText(text) {
if (text == null) {
return null;
} else {
return text.trim();
}
}
function pageNumberForUrl(targetUrl) {
let parsedUrl = url.parse(targetUrl, { parseQueryString: true });
let pageNumber = parseInt(defaultValue(parsedUrl.query.page, 1));
return pageNumber;
}
module.exports = function createLidlServiceScraper(options) {
let session = bhttp.session({
headers: {
"User-Agent": "Manual scraper (contact/problems: admin@cryto.net)"
}
});
let currentSessionId;
return {
name: "Lidl-Service.com",
initialize: function () {
return Promise.try(() => {
return session.get("https://www.lidl-service.com/");
}).then((response) => {
ensureValidResponse(response);
let $ = cheerio.load(response.body);
let targetUrl = $("a.de").attr("href");
let sessionId = sessionIdRegex.exec(targetUrl);
if (sessionId == null) {
throw new Error("Did not find expected session ID in URL");
} else {
currentSessionId = sessionId[0];
return resolveFromRoot(targetUrl);
}
});
},
handlers: [
[/&searchType=/, "category", (targetUrl, {description, metadata}) => {
return Promise.try(() => {
return session.get(targetUrl);
}).then((response) => {
ensureValidResponse(response);
/* TODO: Explain this conditional. */
if (metadata.isCategoryIndex) {
return {
urls: [{
description: `${description} - Page 1`,
url: mergeUrl(targetUrl, {
query: {
action: "search",
searchType: "search2",
searchRefresh: "chgPage",
rdeLocaleAttr: "de",
page: "1"
}
})
}]
};
} else {
let $ = cheerio.load(response.body);
let items = $("#product-search-results tbody tr").get().map((row) => {
let item = $(row);
return {
url: resolveFromRoot(item.find("h3 a").attr("href")),
picture: resolveFromRoot(item.find("img").attr("src")),
title: trimText(item.find("h3 a").text()),
sku: parseSku(trimText(item.find(".ian").text())),
description: partialData(trimText(item.find(".col2 ul").html()))
};
});
let currentPageNumber = pageNumberForUrl(targetUrl);
let nextPageButton = $(".page-navigation a.next");
let nextPageUrl;
if (nextPageButton.length > 0) {
nextPageUrl = resolveFromRoot(nextPageButton.attr("href"));
}
let urls = items.map((item) => {
return {
url: item.url,
description: item.title
};
});
if (nextPageUrl != null) {
urls = urls.concat([{
url: nextPageUrl,
description: description.replace(/- Page [0-9]+/, `- Page ${currentPageNumber + 1}`),
sameDepth: true
}]);
}
return {
urls: urls,
items: items
};
}
});
}],
[/\?rdeLocaleAttr=/, "index", (targetUrl) => {
return Promise.try(() => {
return session.get(targetUrl);
}).then((response) => {
ensureValidResponse(response);
let $ = cheerio.load(response.body);
let categories = $("#select-product-categories option")
.get()
.map((item) => $(item).attr("value"));
return {
items: [],
urls: categories.map((categoryQuery) => {
return {
description: categoryQuery,
url: mergeUrl(targetUrl, {
query: {
action: "search",
searchType: "search2",
searchText: categoryQuery,
x: 37,
y: 3
}
}),
metadata: {
isCategoryIndex: true
}
};
})
};
})
}],
[/\/product.html/, "product", (targetUrl) => {
/* Product page */
return Promise.try(() => {
return session.get(targetUrl);
}).then((response) => {
ensureValidResponse(response);
let $ = cheerio.load(response.body);
return {
items: [{
title: trimText($(".description h1").text()),
sku: parseSku(trimText($(".description .ian").text())),
description: trimText($(".description > ul").html()),
brand: trimText($(".brand-image img").attr("alt")),
downloads: $(".description table a").get().map((element) => {
let link = $(element);
let flagTag = link.closest("table").prev(".table-flag-tag").find("span");
let column = link.closest("td").index();
let typeHeading = link.closest("tbody").prev("thead").find("th").eq(column);
return {
language: flagTag.attr("class"),
type: parseType(typeHeading.text()),
description: link.text(),
url: resolveFromRoot(link.attr("href")),
};
})
}]
}
});
}]
],
updateUrl: function (oldUrl) {
return oldUrl.replace(sessionIdRegex, currentSessionId);
}
}
};

lib/simple-runner.js
@@ -0,0 +1,57 @@
"use strict";
const Promise = require("bluebird");
const promiseTaskQueue = require("promise-task-queue");
const assureArray = require("assure-array");
const createScrapeLogger = require("./scrape-logger");
const normalizeTags = require("./normalize-items");
module.exports = function simpleRunner(createScraper) {
let scraper = createScraper();
let logger = createScrapeLogger({ name: scraper.name });
let queue = promiseTaskQueue();
queue.define("scrape", (task) => {
let {url} = task;
let updatedUrl = scraper.updateUrl(url);
let matchingHandler = scraper.handlers.find(([regex, _handler]) => regex.exec(updatedUrl));
if (matchingHandler == null) {
logger.error(`Scraper does not have a handler for URL in queue: ${updatedUrl}`);
} else {
return Promise.try(() => {
/* Handlers are `[regex, type, handler]` triples, and expect an options object as their second argument. */
let [_regex, _type, handler] = matchingHandler;
return handler(updatedUrl, { metadata: {} });
}).then(({urls, items}) => {
if (urls != null) {
urls.forEach((url) => {
logger.debug(`Queueing new URL: ${url}`);
queue.push("scrape", {url: url});
});
}
if (items != null) {
items.forEach((item) => {
let normalizedItem = normalizeTags(item);
logger.done(normalizedItem);
logger.debug(`Encountered item`, normalizedItem);
});
}
return null;
});
}
}, { interval: 0.5 });
return Promise.try(() => {
return scraper.initialize();
}).then((rootUrls) => {
assureArray(rootUrls).forEach((url) => {
queue.push("scrape", {url: url});
});
});
};

@@ -0,0 +1,42 @@
'use strict';
module.exports.up = function(knex, Promise) {
return Promise.try(() => {
return knex.schema.createTable("scrapers", (table) => {
table.increments("id").primary();
table.text("name").unique();
});
}).then(() => {
return knex.schema.createTable("urls", (table) => {
table.bigIncrements("id").primary();
table.integer("scraper_id").references("scrapers.id").notNull();
table.bigInteger("parent_id").references("urls.id");
table.integer("depth").notNull();
table.boolean("scraped").notNull().defaultTo(false);
table.text("url").notNull();
table.text("description");
table.jsonb("metadata");
table.timestamp("scraped_at");
});
}).then(() => {
return knex.schema.createTable("items", (table) => {
table.bigIncrements("id").primary();
table.integer("scraper_id").references("scrapers.id").notNull();
table.bigInteger("url_id").references("urls.id").notNull();
table.boolean("processed").notNull().defaultTo(false);
table.boolean("is_partial").notNull();
table.jsonb("data");
table.timestamp("scraped_at");
});
});
}
module.exports.down = function(knex, Promise) {
return Promise.try(() => {
return knex.schema.dropTable("items");
}).then(() => {
return knex.schema.dropTable("urls");
}).then(() => {
return knex.schema.dropTable("scrapers");
});
}
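Before `index.js` can run, the migration above needs to be applied. A sketch of one way to do that programmatically with the same knexfile, assuming the migration file lives in knex's default `./migrations` directory (knex also ships a CLI for this):

"use strict";

const Promise = require("bluebird");
const knex = require("knex");

let db = knex(require("./knexfile"));

Promise.try(() => {
	/* Applies all migrations in ./migrations that haven't been run yet. */
	return db.migrate.latest();
}).then(() => {
	console.log("Migrations applied");
}).finally(() => {
	return db.destroy();
});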

@@ -0,0 +1,20 @@
todo:
- lockable roots
- full-size pictures for lidl
- metadata language indicator
- image -> images
- separate result processing pipeline for e.g. deduplication
- depth-first scraping, because this sidesteps any issues with stuff like category IDs being stored in the session rather than in the URL
- mark some URLs as 'must redo when recovering from crash' (from the parent URL list), e.g. for category pages
----- categories -----
category index (pagenum 1, categoryIndex true)
page 2 (pagenum 2, categoryIndex false)
page 1 (pagenum 1, categoryIndex false)
page 3 (pagenum 3, categoryIndex false)
page 4 (pagenum 4, categoryIndex false)

package.json
@@ -0,0 +1,23 @@
{
"name": "manuals-r-us",
"version": "1.0.0",
"main": "index.js",
"repository": "git@git.cryto.net:joepie91/manuals-r-us.git",
"author": "Sven Slootweg <admin@cryto.net>",
"license": "MIT",
"dependencies": {
"assure-array": "^1.0.0",
"bhttp": "^1.2.4",
"bluebird": "^3.5.3",
"chalk": "^2.4.2",
"cheerio": "^1.0.0-rc.2",
"create-error": "^0.3.1",
"database-error": "^2.0.1",
"default-value": "^1.0.0",
"knex": "^0.16.3",
"pg": "^7.9.0",
"promise-task-queue": "^1.2.0",
"promise-while-loop": "^1.0.1",
"surgeon": "^3.13.0"
}
}

File diff suppressed because it is too large.