"use strict"; const Promise = require("bluebird"); const bhttp = require("bhttp"); const url = require("url"); const cheerio = require("cheerio"); const defaultValue = require("default-value"); const mergeUrl = require("../merge-url"); const partialData = require("../partial-data"); function ensureValidResponse(response) { if (response.statusCode !== 200) { throw new Error(`Encountered non-200 response`); } } let sessionIdRegex = /\/SID-[0-9A-F]{8}-[0-9A-F]{8}\//; function resolveFromRoot(path) { return url.resolve("https://www.lidl-service.com/", path); } function parseSku(text) { return text.match(/IAN: ([0-9]+)/)[1]; } function parseType(text) { let trimmedText = trimText(text); if (trimmedText === "Bedienungsanleitung") { return "manual"; } else if (trimmedText === "Treiber") { return "driver"; } else if (trimmedText === "Sonstiges") { /* "Miscellaneous" */ return null; } else { throw new Error(`Unrecognized download type: ${trimmedText}`); } } function trimText(text) { if (text == null) { return null; } else { return text.trim(); } } function pageNumberForUrl(targetUrl) { let parsedUrl = url.parse(targetUrl, { parseQueryString: true }); let pageNumber = parseInt(defaultValue(parsedUrl.query.page, 1)); return pageNumber; } module.exports = function createLidlServiceScraper(options) { let session = bhttp.session({ headers: { "User-Agent": "Manual scraper (contact/problems: admin@cryto.net)" } }); let currentSessionId; return { name: "Lidl-Service.com", initialize: function () { return Promise.try(() => { return session.get("https://www.lidl-service.com/"); }).then((response) => { ensureValidResponse(response); let $ = cheerio.load(response.body); let targetUrl = $("a.de").attr("href"); let sessionId = sessionIdRegex.exec(targetUrl); if (sessionId == null) { throw new Error("Did not find expected session ID in URL"); } else { currentSessionId = sessionId[0]; return resolveFromRoot(targetUrl); } }); }, handlers: [ [/&searchType=/, "category", (targetUrl, {description, metadata}) => { return Promise.try(() => { return session.get(targetUrl); }).then((response) => { ensureValidResponse(response); /* TODO: Explain this conditional. */ if (metadata.isCategoryIndex) { return { urls: [{ description: `${description} - Page 1`, url: mergeUrl(targetUrl, { query: { action: "search", searchType: "search2", searchRefresh: "chgPage", rdeLocaleAttr: "de", page: "1" } }) }] }; } else { let $ = cheerio.load(response.body); let items = $("#product-search-results tbody tr").get().map((row) => { let item = $(row); return { url: resolveFromRoot(item.find("h3 a").attr("href")), picture: resolveFromRoot(item.find("img").attr("src")), title: trimText(item.find("h3 a").text()), sku: parseSku(trimText(item.find(".ian").text())), description: partialData(trimText(item.find(".col2 ul").html())) }; }); let currentPageNumber = pageNumberForUrl(targetUrl); let nextPageButton = $(".page-navigation a.next"); let nextPageUrl; if (nextPageButton.length > 0) { nextPageUrl = resolveFromRoot(nextPageButton.attr("href")); } let urls = items.map((item) => { return { url: item.url, description: item.title }; }); if (nextPageUrl != null) { urls = urls.concat([{ url: nextPageUrl, description: description.replace(/- Page [0-9]+/, `- Page ${currentPageNumber + 1}`), sameDepth: true }]); } return { urls: urls, items: items }; } }); }], [/\?rdeLocaleAttr=/, "index", (targetUrl) => { return Promise.try(() => { return session.get(targetUrl); }).then((response) => { ensureValidResponse(response); let $ = cheerio.load(response.body); let categories = $("#select-product-categories option") .get() .map((item) => $(item).attr("value")); return { items: [], urls: categories.map((categoryQuery) => { return { description: categoryQuery, url: mergeUrl(targetUrl, { query: { action: "search", searchType: "search2", searchText: categoryQuery, x: 37, y: 3 } }), metadata: { isCategoryIndex: true } }; }) }; }) }], [/\/product.html/, "product", (targetUrl) => { /* Product page */ return Promise.try(() => { return session.get(targetUrl); }).then((response) => { ensureValidResponse(response); let $ = cheerio.load(response.body); return { items: [{ title: trimText($(".description h1").text()), sku: parseSku(trimText($(".description .ian").text())), description: trimText($(".description > ul").html()), brand: trimText($(".brand-image img").attr("alt")), downloads: $(".description table a").get().map((element) => { let link = $(element); let flagTag = link.closest("table").prev(".table-flag-tag").find("span"); let column = link.closest("td").index(); let typeHeading = link.closest("tbody").prev("thead").find("th").eq(column); return { language: flagTag.attr("class"), type: parseType(typeHeading.text()), description: link.text(), url: resolveFromRoot(link.attr("href")), }; }) }] } }); }] ], updateUrl: function (oldUrl) { return oldUrl.replace(sessionIdRegex, currentSessionId); } } };