You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
203 lines
6.0 KiB
JavaScript
203 lines
6.0 KiB
JavaScript
"use strict";
|
|
|
|
const assert = require("assert");
|
|
const cheerio = require("cheerio");
|
|
const syncpipe = require("syncpipe");
|
|
const url = require("url");
|
|
|
|
const pipe = require("@promistream/pipe");
|
|
const simpleSink = require("@promistream/simple-sink");
|
|
const assureResponse = require("../../shared/assure-response");
|
|
const parseSitemapResponse = require("../../shared/parse-sitemap-response");
|
|
const createDatasheet = require("../../shared/create-datasheet");
|
|
|
|
// TME.eu
|
|
|
|
/**
 * Returns the first usable entry from a list of candidate values.
 *
 * A candidate is skipped when it is `null`, `undefined` or the empty string;
 * every other value (including `0` and `false`) counts as a match.
 *
 * @param {Iterable<*>} options - Candidates, in order of preference.
 * @returns {*} The first usable candidate, or `undefined` when there is none.
 */
function firstMatch(options) {
	for (const option of options) {
		// `!= null` deliberately matches both null and undefined
		if (option != null && option !== "") {
			return option;
		}
	}

	return undefined;
}
|
|
|
|
module.exports = function ({ session }) {
|
|
return {
|
|
seed: [{
|
|
id: "tme:sitemap:index",
|
|
tags: [ "tme:sitemap" ],
|
|
data: { url: "https://www.tme.eu/en/sitemap.xml" }
|
|
// TODO: Delete derived sitemap entries
|
|
}],
|
|
tags: {
|
|
"tme:sitemap": [ "tme:scrapeSitemap" ],
|
|
"tme:product": [ "tme:scrapeProduct", "tme:normalizeProduct" ],
|
|
},
|
|
tasks: {
|
|
"tme:scrapeSitemap": {
|
|
ttl: "3d",
|
|
taskInterval: "30s",
|
|
run: async function ({ data, createItem, deleteItem }) {
|
|
let response = await session.get(data.url, { stream: true });
|
|
assureResponse(response);
|
|
|
|
let resultCount = 0;
|
|
|
|
await pipe([
|
|
parseSitemapResponse(response),
|
|
simpleSink((item) => {
|
|
assert(item.url);
|
|
|
|
if (item.type === "sitemap") {
|
|
// NOTE: We are only interested in the sitemaps that enumerate components, not those that list categories etc.
|
|
if (/pip_part[0-9]+\.xml(\.gz)?/.test(item.url)) {
|
|
createItem({
|
|
id: `tme:sitemap:${item.url}`,
|
|
tags: [ "tme:sitemap" ],
|
|
data: { url: item.url }
|
|
});
|
|
|
|
resultCount += 1;
|
|
}
|
|
} else if (item.type === "url") {
|
|
if (item.url.startsWith("https://www.tme.eu/en/details/")) {
|
|
createItem({
|
|
id: `tme:product:${item.url}`,
|
|
tags: [ "tme:product" ],
|
|
data: { url: item.url }
|
|
});
|
|
|
|
resultCount += 1;
|
|
}
|
|
}
|
|
})
|
|
]).read();
|
|
|
|
// If we don't get at least *some* items out of a sitemap, something is wrong - eg. the URL format changed and we are no longer matching anything.
|
|
assert(resultCount > 0);
|
|
|
|
// FIXME: Do this here? Or is there a reason *not* to delete the sitemap entry?
|
|
// deleteItem();
|
|
}
|
|
},
|
|
"tme:scrapeProduct": {
|
|
ttl: "60d",
|
|
taskInterval: "500ms",
|
|
run: async function ({ data, createAlias, updateData, expireDependents }) {
|
|
let response = await session.get(data.url);
|
|
assureResponse(response);
|
|
|
|
let $ = cheerio.load(response.body);
|
|
|
|
// FIXME: This is currently broken!
|
|
let allMetaHeaders = syncpipe($("h2.o-semantic-only-header").toArray(), [
|
|
(_) => _.map((header) => {
|
|
let $header = $(header);
|
|
|
|
return [
|
|
$header.find(".name").text().trim(),
|
|
$header.find(".value").text().trim()
|
|
];
|
|
}),
|
|
(_) => Object.fromEntries(_)
|
|
]);
|
|
|
|
let descriptionElement = $(".c-pip__description > h2").eq(0);
|
|
|
|
let itemData = {
|
|
productID: $("h2.c-pip__symbol--tme .c-pip__symbol-value").eq(0).text().trim(),
|
|
manufacturer: $("a.pip__product-header-title").eq(0).text().trim(),
|
|
model: firstMatch([
|
|
$("h2.c-pip__symbol--producer .c-pip__symbol-value").eq(0).text().trim(), // Manufacturer part number
|
|
$("h2.c-pip__symbol--tme .c-pip__symbol-value").eq(0).text().trim(), // TME Symbol
|
|
]),
|
|
description: (descriptionElement.children().length === 0) // This skips meta fields if there is no description element
|
|
? descriptionElement.text().trim()
|
|
: null,
|
|
documents: $("div.c-pip__document > a").toArray()
|
|
.map((link) => {
|
|
let relativeLink = $(link).attr("href");
|
|
|
|
if (relativeLink != null) {
|
|
return {
|
|
description: $(link).text().trim(),
|
|
url: url.resolve(
|
|
data.url,
|
|
relativeLink
|
|
)
|
|
};
|
|
} else {
|
|
// Probably a video popup
|
|
return null;
|
|
}
|
|
})
|
|
.filter((item) => {
|
|
return item != null;
|
|
}),
|
|
// TODO: Scrape prices
|
|
};
|
|
|
|
assert(itemData.productID != null);
|
|
assert(itemData.manufacturer != null);
|
|
assert(itemData.model != null);
|
|
|
|
createAlias({ from: `tme:product:${itemData.productID}` });
|
|
|
|
updateData((oldData) => ({
|
|
... oldData,
|
|
itemData: itemData
|
|
}));
|
|
|
|
expireDependents();
|
|
}
|
|
},
|
|
"tme:normalizeProduct": {
|
|
dependsOn: [ "tme:scrapeProduct" ],
|
|
version: "5",
|
|
parallelTasks: 50,
|
|
run: async function (api) {
|
|
let { data } = api;
|
|
|
|
function isEnglish(document) {
|
|
return /\sen\s*$/i.test(document.description);
|
|
}
|
|
|
|
if (data.itemData.documents.length > 0) {
|
|
if (typeof data.itemData.documents[0] === "string") {
|
|
// Temporary workaround for the dependsOn not taking into account task versions, and some old records existing with a wrong documents structure
|
|
return;
|
|
}
|
|
|
|
let manufacturer = data.itemData.manufacturer;
|
|
let modelName = data.itemData.model;
|
|
let description = data.itemData.description;
|
|
let productID = data.itemData.productID;
|
|
|
|
let firstEnglish = data.itemData.documents.find((document) => isEnglish(document));
|
|
|
|
let bestDocument = (firstEnglish != null)
|
|
? firstEnglish
|
|
: data.itemData.documents[0];
|
|
|
|
let bestDocumentIsEnglish = isEnglish(bestDocument);
|
|
|
|
createDatasheet(api, {
|
|
priority: (bestDocumentIsEnglish)
|
|
? 0.6
|
|
: 0.5,
|
|
source: "tme",
|
|
manufacturer: manufacturer,
|
|
productID: productID,
|
|
name: modelName,
|
|
description: description,
|
|
url: bestDocument.url,
|
|
// NOTE: Most (but not all!) manufacturers on TME are, incorrectly, in ALL-CAPS. This 'fixes' those cases through best-effort capitalization. Many (but less!) will still be wrong and need to be fixed later.
|
|
fixCasing: true
|
|
});
|
|
}
|
|
}
|
|
},
|
|
}
|
|
};
|
|
};
|