You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
231 lines
5.7 KiB
JavaScript
231 lines
5.7 KiB
JavaScript
"use strict";
|
|
|
|
const Promise = require("bluebird");
|
|
const bhttp = require("bhttp");
|
|
const url = require("url");
|
|
const cheerio = require("cheerio");
|
|
const defaultValue = require("default-value");
|
|
|
|
const mergeUrl = require("../merge-url");
|
|
const partialData = require("../partial-data");
|
|
|
|
/**
 * Asserts that an HTTP response has status code 200.
 *
 * @param {Object} response Response object exposing a numeric `statusCode`.
 * @throws {Error} When the status code is anything other than 200; the
 *     message includes the status code that was received.
 */
function ensureValidResponse(response) {
    if (response.statusCode === 200) {
        return;
    }

    /* Keep the original message prefix and append the actual status so that
       failures can be diagnosed from the error alone. */
    throw new Error(`Encountered non-200 response (status code: ${response.statusCode})`);
}
|
|
|
|
/* Matches the session ID path segment, including its surrounding slashes:
   "/SID-" followed by two dash-separated groups of 8 uppercase hex digits. */
const sessionIdRegex = /\/SID-[0-9A-F]{8}-[0-9A-F]{8}\//;
|
|
|
|
/**
 * Resolves a (possibly relative) path or URL against the site root,
 * "https://www.lidl-service.com/".
 *
 * @param {string} path Path or URL as found in a page, e.g. an `href` value.
 * @returns {string} The resolved absolute URL.
 */
function resolveFromRoot(path) {
    const siteRoot = "https://www.lidl-service.com/";

    return url.resolve(siteRoot, path);
}
|
|
|
|
/**
 * Extracts the article number from text of the form "IAN: <digits>".
 *
 * @param {string} text Text expected to contain "IAN: " followed by digits.
 * @returns {string} The digits following "IAN: ", as a string.
 * @throws {Error} When the text is not a string or contains no such number.
 */
function parseSku(text) {
    const match = (typeof text === "string") ? text.match(/IAN: ([0-9]+)/) : null;

    if (match == null) {
        /* Fail with a descriptive error rather than a TypeError from
           indexing into a null match result. */
        throw new Error(`Could not find IAN in text: ${text}`);
    }

    return match[1];
}
|
|
|
|
/**
 * Maps a (German) download-table heading to a download type.
 *
 * @param {?string} text Heading text; it is trimmed before comparison.
 * @returns {?string} "manual", "driver", or null for the miscellaneous heading.
 * @throws {Error} When the trimmed heading is not one of the known values.
 */
function parseType(text) {
    const heading = trimText(text);

    switch (heading) {
        case "Bedienungsanleitung":
            /* "Operating manual" */
            return "manual";
        case "Treiber":
            /* "Driver" */
            return "driver";
        case "Sonstiges":
            /* "Miscellaneous" */
            return null;
        default:
            throw new Error(`Unrecognized download type: ${heading}`);
    }
}
|
|
|
|
/**
 * Null-safe trim.
 *
 * @param {?string} text Text to trim; may be null or undefined.
 * @returns {?string} The text without leading/trailing whitespace, or null
 *     when the input is null or undefined.
 */
function trimText(text) {
    /* Loose equality on purpose: matches both null and undefined. */
    if (text == null) {
        return null;
    }

    return text.trim();
}
|
|
|
|
/**
 * Reads the `page` query parameter of a URL as an integer.
 *
 * @param {string} targetUrl URL to inspect.
 * @returns {number} The parsed page number; the fallback value 1 is handed to
 *     `defaultValue` for the case where `page` is absent. The result is NaN
 *     when the parameter is present but not numeric.
 */
function pageNumberForUrl(targetUrl) {
    /* The second argument of url.parse is a boolean flag that enables
       query-string parsing. */
    const parsedUrl = url.parse(targetUrl, true);

    /* Always parse as base 10 so the prefix of the value cannot change the
       base (e.g. a leading "0x"). */
    return parseInt(defaultValue(parsedUrl.query.page, 1), 10);
}
|
|
|
|
module.exports = function createLidlServiceScraper(options) {
|
|
let session = bhttp.session({
|
|
headers: {
|
|
"User-Agent": "Manual scraper (contact/problems: admin@cryto.net)"
|
|
}
|
|
});
|
|
|
|
let currentSessionId;
|
|
|
|
return {
|
|
name: "Lidl-Service.com",
|
|
initialize: function () {
|
|
return Promise.try(() => {
|
|
return session.get("https://www.lidl-service.com/");
|
|
}).then((response) => {
|
|
ensureValidResponse(response);
|
|
|
|
let $ = cheerio.load(response.body);
|
|
let targetUrl = $("a.de").attr("href");
|
|
|
|
let sessionId = sessionIdRegex.exec(targetUrl);
|
|
|
|
if (sessionId == null) {
|
|
throw new Error("Did not find expected session ID in URL");
|
|
} else {
|
|
currentSessionId = sessionId[0];
|
|
return resolveFromRoot(targetUrl);
|
|
}
|
|
});
|
|
},
|
|
handlers: [
|
|
[/&searchType=/, "category", (targetUrl, {description, metadata}) => {
|
|
return Promise.try(() => {
|
|
return session.get(targetUrl);
|
|
}).then((response) => {
|
|
ensureValidResponse(response);
|
|
|
|
/* TODO: Explain this conditional. */
|
|
if (metadata.isCategoryIndex) {
|
|
return {
|
|
urls: [{
|
|
description: `${description} - Page 1`,
|
|
url: mergeUrl(targetUrl, {
|
|
query: {
|
|
action: "search",
|
|
searchType: "search2",
|
|
searchRefresh: "chgPage",
|
|
rdeLocaleAttr: "de",
|
|
page: "1"
|
|
}
|
|
})
|
|
}]
|
|
};
|
|
} else {
|
|
let $ = cheerio.load(response.body);
|
|
|
|
let items = $("#product-search-results tbody tr").get().map((row) => {
|
|
let item = $(row);
|
|
|
|
return {
|
|
url: resolveFromRoot(item.find("h3 a").attr("href")),
|
|
picture: resolveFromRoot(item.find("img").attr("src")),
|
|
title: trimText(item.find("h3 a").text()),
|
|
sku: parseSku(trimText(item.find(".ian").text())),
|
|
description: partialData(trimText(item.find(".col2 ul").html()))
|
|
};
|
|
});
|
|
|
|
let currentPageNumber = pageNumberForUrl(targetUrl);
|
|
|
|
let nextPageButton = $(".page-navigation a.next");
|
|
let nextPageUrl;
|
|
|
|
if (nextPageButton.length > 0) {
|
|
nextPageUrl = resolveFromRoot(nextPageButton.attr("href"));
|
|
}
|
|
|
|
let urls = items.map((item) => {
|
|
return {
|
|
url: item.url,
|
|
description: item.title
|
|
};
|
|
});
|
|
|
|
if (nextPageUrl != null) {
|
|
urls = urls.concat([{
|
|
url: nextPageUrl,
|
|
description: description.replace(/- Page [0-9]+/, `- Page ${currentPageNumber + 1}`),
|
|
sameDepth: true
|
|
}]);
|
|
}
|
|
|
|
return {
|
|
urls: urls,
|
|
items: items
|
|
};
|
|
}
|
|
});
|
|
}],
|
|
[/\?rdeLocaleAttr=/, "index", (targetUrl) => {
|
|
return Promise.try(() => {
|
|
return session.get(targetUrl);
|
|
}).then((response) => {
|
|
ensureValidResponse(response);
|
|
|
|
let $ = cheerio.load(response.body);
|
|
|
|
let categories = $("#select-product-categories option")
|
|
.get()
|
|
.map((item) => $(item).attr("value"));
|
|
|
|
return {
|
|
items: [],
|
|
urls: categories.map((categoryQuery) => {
|
|
return {
|
|
description: categoryQuery,
|
|
url: mergeUrl(targetUrl, {
|
|
query: {
|
|
action: "search",
|
|
searchType: "search2",
|
|
searchText: categoryQuery,
|
|
x: 37,
|
|
y: 3
|
|
}
|
|
}),
|
|
metadata: {
|
|
isCategoryIndex: true
|
|
}
|
|
};
|
|
})
|
|
};
|
|
})
|
|
}],
|
|
[/\/product.html/, "product", (targetUrl) => {
|
|
/* Product page */
|
|
|
|
return Promise.try(() => {
|
|
return session.get(targetUrl);
|
|
}).then((response) => {
|
|
ensureValidResponse(response);
|
|
|
|
let $ = cheerio.load(response.body);
|
|
|
|
return {
|
|
items: [{
|
|
title: trimText($(".description h1").text()),
|
|
sku: parseSku(trimText($(".description .ian").text())),
|
|
description: trimText($(".description > ul").html()),
|
|
brand: trimText($(".brand-image img").attr("alt")),
|
|
downloads: $(".description table a").get().map((element) => {
|
|
let link = $(element);
|
|
let flagTag = link.closest("table").prev(".table-flag-tag").find("span");
|
|
|
|
let column = link.closest("td").index();
|
|
let typeHeading = link.closest("tbody").prev("thead").find("th").eq(column);
|
|
|
|
return {
|
|
language: flagTag.attr("class"),
|
|
type: parseType(typeHeading.text()),
|
|
description: link.text(),
|
|
url: resolveFromRoot(link.attr("href")),
|
|
};
|
|
})
|
|
}]
|
|
}
|
|
});
|
|
}]
|
|
],
|
|
updateUrl: function (oldUrl) {
|
|
return oldUrl.replace(sessionIdRegex, currentSessionId);
|
|
}
|
|
}
|
|
};
|