You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

231 lines
5.7 KiB
JavaScript

"use strict";
const Promise = require("bluebird");
const bhttp = require("bhttp");
const url = require("url");
const cheerio = require("cheerio");
const defaultValue = require("default-value");
const mergeUrl = require("../merge-url");
const partialData = require("../partial-data");
/**
 * Asserts that an HTTP response completed with status 200.
 *
 * @param {{statusCode: number}} response - Response object exposing a `statusCode` property.
 * @throws {Error} When the status code is anything other than 200. The message
 *   includes the offending status code to make failures diagnosable.
 */
function ensureValidResponse(response) {
  if (response.statusCode !== 200) {
    throw new Error(`Encountered non-200 response (status code: ${response.statusCode})`);
  }
}
/* Matches a session ID path segment, including both surrounding slashes:
   "/SID-", 8 characters from [0-9A-F], "-", 8 more characters from [0-9A-F], "/". */
let sessionIdRegex = /\/SID-[0-9A-F]{8}-[0-9A-F]{8}\//;
/**
 * Resolves a (possibly relative) path or URL against the site root.
 *
 * @param {string} path - Path or URL to resolve.
 * @returns {string} The resolved URL, as produced by `url.resolve`.
 */
function resolveFromRoot(path) {
  const siteRoot = "https://www.lidl-service.com/";
  const resolved = url.resolve(siteRoot, path);

  return resolved;
}
/**
 * Extracts the numeric article number from text of the form "IAN: 12345".
 *
 * @param {string} text - Text expected to contain "IAN: " followed by digits.
 * @returns {string} The digits following "IAN: ", as a string.
 * @throws {Error} When `text` is not a string or contains no "IAN: <digits>" sequence.
 */
function parseSku(text) {
  const match = (typeof text === "string") ? text.match(/IAN: ([0-9]+)/) : null;

  if (match == null) {
    /* Fail with an explicit message rather than a TypeError from indexing `null`. */
    throw new Error(`Could not find an IAN (SKU) in text: ${JSON.stringify(text)}`);
  }

  return match[1];
}
/**
 * Maps a (German) download-type heading to an internal type identifier.
 *
 * @param {?string} text - Raw heading text; it is trimmed before comparison.
 * @returns {?string} "manual" for "Bedienungsanleitung", "driver" for "Treiber",
 *   and `null` for "Sonstiges".
 * @throws {Error} When the trimmed heading is none of the recognized values.
 */
function parseType(text) {
  const heading = trimText(text);

  switch (heading) {
    case "Bedienungsanleitung":
      return "manual";
    case "Treiber":
      return "driver";
    case "Sonstiges":
      /* "Miscellaneous" - no specific type is assigned. */
      return null;
    default:
      throw new Error(`Unrecognized download type: ${heading}`);
  }
}
/**
 * Null-tolerant wrapper around `String.prototype.trim`.
 *
 * @param {?string} text - Text to trim; may be `null` or `undefined`.
 * @returns {?string} The trimmed text, or `null` when `text` is `null` or `undefined`.
 */
function trimText(text) {
  /* Loose equality on purpose: covers both null and undefined. */
  return (text == null) ? null : text.trim();
}
/**
 * Reads the `page` query parameter of a URL as an integer.
 *
 * @param {string} targetUrl - URL whose query string may contain a `page` parameter.
 * @returns {number} The page number parsed as a base-10 integer; the fallback
 *   value 1 is passed to `defaultValue` for when `page` is absent.
 */
function pageNumberForUrl(targetUrl) {
  /* The second argument of url.parse is the boolean `parseQueryString` flag;
     `true` makes `query` an object instead of a raw string. */
  const parsedUrl = url.parse(targetUrl, true);

  /* Explicit radix so the value is always interpreted as decimal. */
  return parseInt(defaultValue(parsedUrl.query.page, 1), 10);
}
module.exports = function createLidlServiceScraper(options) {
let session = bhttp.session({
headers: {
"User-Agent": "Manual scraper (contact/problems: admin@cryto.net)"
}
});
let currentSessionId;
return {
name: "Lidl-Service.com",
initialize: function () {
return Promise.try(() => {
return session.get("https://www.lidl-service.com/");
}).then((response) => {
ensureValidResponse(response);
let $ = cheerio.load(response.body);
let targetUrl = $("a.de").attr("href");
let sessionId = sessionIdRegex.exec(targetUrl);
if (sessionId == null) {
throw new Error("Did not find expected session ID in URL");
} else {
currentSessionId = sessionId[0];
return resolveFromRoot(targetUrl);
}
});
},
handlers: [
[/&searchType=/, "category", (targetUrl, {description, metadata}) => {
return Promise.try(() => {
return session.get(targetUrl);
}).then((response) => {
ensureValidResponse(response);
/* TODO: Explain this conditional. */
if (metadata.isCategoryIndex) {
return {
urls: [{
description: `${description} - Page 1`,
url: mergeUrl(targetUrl, {
query: {
action: "search",
searchType: "search2",
searchRefresh: "chgPage",
rdeLocaleAttr: "de",
page: "1"
}
})
}]
};
} else {
let $ = cheerio.load(response.body);
let items = $("#product-search-results tbody tr").get().map((row) => {
let item = $(row);
return {
url: resolveFromRoot(item.find("h3 a").attr("href")),
picture: resolveFromRoot(item.find("img").attr("src")),
title: trimText(item.find("h3 a").text()),
sku: parseSku(trimText(item.find(".ian").text())),
description: partialData(trimText(item.find(".col2 ul").html()))
};
});
let currentPageNumber = pageNumberForUrl(targetUrl);
let nextPageButton = $(".page-navigation a.next");
let nextPageUrl;
if (nextPageButton.length > 0) {
nextPageUrl = resolveFromRoot(nextPageButton.attr("href"));
}
let urls = items.map((item) => {
return {
url: item.url,
description: item.title
};
});
if (nextPageUrl != null) {
urls = urls.concat([{
url: nextPageUrl,
description: description.replace(/- Page [0-9]+/, `- Page ${currentPageNumber + 1}`),
sameDepth: true
}]);
}
return {
urls: urls,
items: items
};
}
});
}],
[/\?rdeLocaleAttr=/, "index", (targetUrl) => {
return Promise.try(() => {
return session.get(targetUrl);
}).then((response) => {
ensureValidResponse(response);
let $ = cheerio.load(response.body);
let categories = $("#select-product-categories option")
.get()
.map((item) => $(item).attr("value"));
return {
items: [],
urls: categories.map((categoryQuery) => {
return {
description: categoryQuery,
url: mergeUrl(targetUrl, {
query: {
action: "search",
searchType: "search2",
searchText: categoryQuery,
x: 37,
y: 3
}
}),
metadata: {
isCategoryIndex: true
}
};
})
};
})
}],
[/\/product.html/, "product", (targetUrl) => {
/* Product page */
return Promise.try(() => {
return session.get(targetUrl);
}).then((response) => {
ensureValidResponse(response);
let $ = cheerio.load(response.body);
return {
items: [{
title: trimText($(".description h1").text()),
sku: parseSku(trimText($(".description .ian").text())),
description: trimText($(".description > ul").html()),
brand: trimText($(".brand-image img").attr("alt")),
downloads: $(".description table a").get().map((element) => {
let link = $(element);
let flagTag = link.closest("table").prev(".table-flag-tag").find("span");
let column = link.closest("td").index();
let typeHeading = link.closest("tbody").prev("thead").find("th").eq(column);
return {
language: flagTag.attr("class"),
type: parseType(typeHeading.text()),
description: link.text(),
url: resolveFromRoot(link.attr("href")),
};
})
}]
}
});
}]
],
updateUrl: function (oldUrl) {
return oldUrl.replace(sessionIdRegex, currentSessionId);
}
}
};