First pass at refactoring for modularity

master
Sven Slootweg 1 year ago
parent 6c43818deb
commit d972746f35

@ -10,7 +10,7 @@ const simpleSink = require("@promistream/simple-sink");
const parseSitemap = require("@promistream/parse-sitemap");
const decodeString = require("@promistream/decode-string");
const assureResponse = require("../../shared/assure-response");
const assureResponse = require("../../../lib/shared/assure-response");
let session = bhttp.session({
headers: {

@ -2,6 +2,7 @@
const bhttp = require("bhttp");
const got = require("got");
const mergeSources = require("./lib/merge-sources");
const assureResponse = require("./lib/shared/assure-response");
@ -29,7 +30,7 @@ let state = {
})
};
module.exports = {
let baseSchema = {
backend: "postgresql",
database: {
host: "/run/postgresql",
@ -38,151 +39,15 @@ module.exports = {
max: 75
}
},
seed: [{
id: "st:home",
tags: [ "st:home" ],
data: {}
}, {
id: "lcsc:home",
tags: [ "lcsc:home" ],
data: {}
}, {
id: "mouser:sitemap:index",
tags: [ "mouser:sitemap" ],
data: { url: "https://www.mouser.com/indexgzipwww.xml" }
}, {
id: "tme:sitemap:index",
tags: [ "tme:sitemap" ],
data: { url: "https://www.tme.eu/en/sitemap.xml" }
// TODO: Delete derived sitemap entries
}, {
id: "farnell:sitemap:index",
tags: [ "farnell:sitemap" ],
data: { url: "https://uk.farnell.com/sitemap.xml" }
// TODO: Delete derived sitemap entries
}, {
id: "focus-lcds:home",
tags: [ "focus-lcds:home" ],
data: {}
}],
tags: {
"st:home": [ "st:findCategories" ],
"st:category": [ "st:scrapeCategory" ],
"st:product": [ "st:scrapeProduct", "st:normalizeProduct" ],
"lcsc:home": [ "lcsc:findCategories" ],
"lcsc:category": [ "lcsc:scrapeCategory" ],
"lcsc:product": [ "lcsc:normalizeProduct" ],
"tme:sitemap": [ "tme:scrapeSitemap" ],
"tme:product": [ "tme:scrapeProduct", "tme:normalizeProduct" ],
// "farnell:sitemap": [ "farnell:scrapeSitemap" ],
// "farnell:product": [ "farnell:scrapeProduct", "farnell:normalizeProduct" ],
"focus-lcds:home": [ "focus-lcds:findCategories" ],
"focus-lcds:category": [ "focus-lcds:scrapeCategory" ],
"focus-lcds:product": [ "focus-lcds:scrapeProduct", "focus-lcds:normalizeProduct" ],
},
tasks: {
// ST Microelectronics
"st:findCategories": {
ttl: "15d",
run: require("./lib/st/task/find-categories")(state)
},
"st:scrapeCategory": {
ttl: "1d",
taskInterval: "60s",
version: "2",
run: require("./lib/st/task/scrape-category")(state)
},
"st:scrapeProduct": {
ttl: "15d",
taskInterval: "5s",
run: require("./lib/st/task/scrape-product")(state)
},
"st:normalizeProduct": {
dependsOn: [ "st:scrapeProduct" ],
version: "8",
parallelTasks: 50,
run: require("./lib/st/task/normalize-product")(state)
},
// LCSC
// FIXME: Commenting out a bunch of tasks but not removing them from the tag assignments will result in an error, but will *not* exit the program. That's probably not right?
"lcsc:findCategories": {
ttl: "30d",
version: "1",
run: require("./lib/lcsc/task/find-categories")(state)
},
"lcsc:scrapeCategory": {
ttl: "30d",
taskInterval: "1m",
run: require("./lib/lcsc/task/scrape-category")(state)
},
"lcsc:normalizeProduct": {
version: "7",
parallelTasks: 50,
run: require("./lib/lcsc/task/normalize-product")(state)
},
// Mouser
"mouser:scrapeSitemap": {
taskInterval: "30s",
run: require("./lib/mouser/task/scrape-sitemap")(state)
},
// TME.eu
"tme:scrapeSitemap": {
ttl: "3d",
taskInterval: "30s",
run: require("./lib/tme/task/scrape-sitemap")(state)
},
"tme:scrapeProduct": {
ttl: "60d",
taskInterval: "500ms",
run: require("./lib/tme/task/scrape-product")(state)
},
"tme:normalizeProduct": {
dependsOn: [ "tme:scrapeProduct" ],
version: "5",
parallelTasks: 50,
run: require("./lib/tme/task/normalize-product")(state)
},
// Farnell
// "farnell:scrapeSitemap": {
// ttl: "3d",
// taskInterval: "30s",
// run: require("./lib/farnell/task/scrape-sitemap")(state)
// },
// "farnell:scrapeProduct": {
// ttl: "60d",
// taskInterval: "500ms",
// run: require("./lib/farnell/task/scrape-product")(state)
// },
// "farnell:normalizeProduct": {
// dependsOn: [ "farnell:scrapeProduct" ],
// version: "1",
// parallelTasks: 50,
// run: require("./lib/farnell/task/normalize-product")(state)
// },
// Focus LCDs
"focus-lcds:findCategories": {
ttl: "60d",
run: require("./lib/focus-lcds/task/find-categories")(state)
},
"focus-lcds:scrapeCategory": {
ttl: "15d",
taskInterval: "1m",
run: require("./lib/focus-lcds/task/scrape-category")(state)
},
"focus-lcds:scrapeProduct": {
ttl: "15d",
taskInterval: "5s",
run: require("./lib/focus-lcds/task/scrape-product")(state)
},
"focus-lcds:normalizeProduct": {
dependsOn: [ "focus-lcds:scrapeProduct" ],
parallelTasks: 50,
run: require("./lib/focus-lcds/task/normalize-product")(state)
},
}
seed: [],
tags: {},
tasks: {}
};
// NOTE: This is *not* currently a fully modular system! Identifiers (tags, task IDs, etc.) are still global to the srap instance as a whole, even though the code exists in different modules. Prefixing identifiers with the scraper they originate from is still necessary!
module.exports = mergeSources(baseSchema, [
require("./lib/sources/datasheets/lcsc")(state),
require("./lib/sources/datasheets/tme")(state),
require("./lib/sources/datasheets/st")(state),
require("./lib/sources/datasheets/focus-lcds")(state),
]);

@ -1,30 +0,0 @@
"use strict";
const syncpipe = require("syncpipe");
const url = require("url");
const assureResponse = require("../../shared/assure-response");
const surgeon = require("../../shared/surgeon-utils");
const uniqueArray = require("../../shared/unique-array");
module.exports = function findCategories({ session }) {
return async function({ createItem }) {
let response = await session.get("https://focuslcds.com/");
assureResponse(response);
let urls = syncpipe(null, [
_ => surgeon(`selectMany ".category-list a" | readAttr href`, response.body.toString()),
_ => uniqueArray(_),
_ => _.filter((relativeURL) => relativeURL !== ""),
_ => _.map((relativeURL) => url.resolve("https://focuslcds.com/", relativeURL))
]);
for (let url of urls) {
createItem({
id: `focus-lcds:category:${url}`,
tags: [ "focus-lcds:category" ],
data: { url: url }
});
}
};
};

@ -1,19 +0,0 @@
"use strict";
const createDatasheet = require("../../shared/create-datasheet");
module.exports = function normalizeProduct() {
return async function (api) {
let { data } = api;
createDatasheet(api, {
priority: 0.8,
source: "focus-lcds",
manufacturer: data.itemData.manufacturer ?? "Focus LCDs",
productID: null,
name: data.itemData.name,
description: data.itemData.description,
url: data.itemData.datasheetURL
});
};
};

@ -1,33 +0,0 @@
"use strict";
const assureResponse = require("../../shared/assure-response");
const surgeon = require("../../shared/surgeon-utils");
module.exports = function scrapeCategory({ session }) {
return async function({ data, createItem }) {
let response = await session.get(data.url);
assureResponse(response);
let body = response.body.toString();
let nextPageURL = surgeon(`selectMaybeN ".pagination-item--next a" 0 | readAttr href`, body);
if (nextPageURL != null) {
createItem({
id: `focus-lcds:category:${nextPageURL}`,
tags: [ "focus-lcds:category" ],
data: { url: nextPageURL }
});
}
let items = surgeon(`selectMany "ul.productList .listItem-title a" | readAttr href`, body);
for (let url of items) {
createItem({
id: `focus-lcds:product:${url}`,
tags: [ "focus-lcds:product" ],
data: { url: url }
});
}
};
};

@ -1,45 +0,0 @@
"use strict";
const url = require("url");
const assureResponse = require("../../shared/assure-response");
const surgeon = require("../../shared/surgeon-utils");
const extractModelNumber = require("../../shared/extract-model-number");
module.exports = function scrapeProduct({ session }) {
return async function({ data, updateData, expireDependents }) {
let response = await session.get(data.url);
assureResponse(response);
let body = response.body.toString();
let itemData = surgeon({
name: [ `selectOne "meta[property='og:title']" | readAttr content`, extractModelNumber ],
manufacturer: `selectMaybeOne "[itemprop='brand']" | text | ignoreEmptyString`,
description: [
`selectMaybeOne "meta[name='description']" | readAttr content`,
// Get rid of the keyword spam...
(description) => description.split(",")[0]
],
image: `selectMaybeOne "meta[property='og:image']" | readAttr content`,
price: `selectMaybeOne .productView-price .price--withoutTax | text`,
datasheetURL: [
`selectMaybeOne ".productView-info a[href^='/content/']" | readAttr href`,
(relativeURL) => (!/^\/content\/?$/.test(relativeURL))
? url.resolve("https://focuslcds.com/", relativeURL)
: null // Ignore when the datasheet URL is *just* /content/, as that means there is no datasheet for this product
],
technicalSpecs: [ `selectAny "#tab-description .productView-info-name"`, {
name: `text`,
value: `nextUntil ".productView-info-name" ".productView-info-value" | index 0 | text`
}]
}, body);
updateData((oldData) => ({
... oldData,
itemData: itemData
}));
expireDependents();
};
};

@ -1,42 +0,0 @@
"use strict";
const assert = require("assert");
const assureResponse = require("../../shared/assure-response");
module.exports = function lcscFindCategories(state) {
let { session } = state;
return async function ({ createItem }) {
let response = await session.get("https://wwwapi.lcsc.com/v1/home/category");
assureResponse(response);
assert(response.body.length > 0);
assert(response.statusCode === 200);
function processCategoryEntries(categories) {
for (let category of categories) {
let productCount = category.productNum;
let pageCount = Math.ceil(productCount / 500);
// NOTE: We do *not* use the page count indicated by the API, but instead calculate it ourself from the product count. This is because the API-specified page count will cap out at the equivalent of 10k items, even when more pages than that are actually available.
for (let i = 1; i <= pageCount; i++) {
createItem({
id: `lcsc:category:${category.catalogId}:page-${i}`,
tags: [ "lcsc:category" ],
data: {
... category,
pageNumber: i
}
});
}
if (category.childCatelogs != null) {
processCategoryEntries(category.childCatelogs);
}
}
}
processCategoryEntries(response.body);
};
};

@ -1,19 +0,0 @@
"use strict";
const createDatasheet = require("../../shared/create-datasheet");
module.exports = function lcscNormalizeProduct() {
return async function (api) {
let { data } = api;
createDatasheet(api, {
priority: 0.4,
source: "lcsc",
manufacturer: data.brandNameEn,
productID: data.productCode,
name: data.productModel,
description: data.productIntroEn,
url: data.pdfUrl
});
};
};

@ -1,37 +0,0 @@
"use strict";
const assert = require("assert");
const assureResponse = require("../../shared/assure-response");
// TODO: Validate response formats with validatem instead
module.exports = function lcscScrapeCategory(state) {
let { session } = state;
return async function ({ data, createItem, deleteItem, updateData }) {
let response = await session.post(`https://wwwapi.lcsc.com/v1/products/list`, {
catalogIdList: [ data.catalogId ],
currentPage: data.pageNumber,
pageSize: 500,
paramNameValueMap: {}
});
assureResponse(response);
assert(response.statusCode === 200);
assert(response.body.productList != null); // Missing from stale queued requests?
assert(response.body.productList.length > 0);
for (let item of response.body.productList) {
createItem({
// NOTE: item.productId seems like the database ID on the website, but item.productCode is the actual LCSC part number used internally for inventory management, so we use that for identification instead
id: `lcsc:product:${item.productCode}`,
tags: [ "lcsc:product" ],
data: item
});
}
// We don't keep around page items, because the amount of pages for a category can change, and so this isn't a stable identifier. They'll be recreated on the next category scrape anyway.
deleteItem();
};
};

@ -0,0 +1,21 @@
"use strict";
const mergeByTemplate = require("merge-by-template");
/**
 * Merge rule for properties that may be supplied by at most one input.
 *
 * @param {*} a - Value from one side of the merge.
 * @param {*} b - Value from the other side of the merge.
 * @throws {Error} When both `a` and `b` are non-nullish.
 * @returns {undefined} Nothing is returned when at most one side is set.
 */
function noOverride(a, b) {
	const eitherMissing = (a == null || b == null);

	if (!eitherMissing) {
		throw new Error(`Property cannot be overridden`);
	}
}
// Merger used by mergeSources, built from a per-property template:
// - `backend` and `database` go through noOverride, which throws when two inputs both supply a value
// - `seed`, `tags` (per property, via anyProperty) and `tasks` use array/object templates
// NOTE(review): how the `[]`, `{}` and `anyProperty([])` templates combine values is defined by merge-by-template (presumably concatenation / key-wise merging) — confirm against its documentation.
// NOTE(review): noOverride returns undefined when at most one side is set; confirm that merge-by-template only invokes function rules when both sides are present, otherwise `backend`/`database` would be lost.
let merge = mergeByTemplate.createMerger({
backend: noOverride,
database: noOverride,
seed: [],
tags: mergeByTemplate.anyProperty([]),
tasks: {}
});
module.exports = function mergeSources(base, sources) {
return merge([ base, ... sources ]);
};

@ -267,4 +267,4 @@ module.exports = surgeon.default({
return url.resolve(base, input);
}
}
});;
});

@ -0,0 +1,137 @@
"use strict";
const syncpipe = require("syncpipe");
const url = require("url");
const assureResponse = require("../../shared/assure-response");
const surgeon = require("../../shared/surgeon-utils");
const uniqueArray = require("../../shared/unique-array");
const extractModelNumber = require("../../shared/extract-model-number");
const createDatasheet = require("../../shared/create-datasheet");
// Focus LCDs
module.exports = function ({ session }) {
return {
seed: [{
id: "focus-lcds:home",
tags: [ "focus-lcds:home" ],
data: {}
}],
tags: {
"focus-lcds:home": [ "focus-lcds:findCategories" ],
"focus-lcds:category": [ "focus-lcds:scrapeCategory" ],
"focus-lcds:product": [ "focus-lcds:scrapeProduct", "focus-lcds:normalizeProduct" ],
},
tasks: {
"focus-lcds:findCategories": {
ttl: "60d",
run: async function({ createItem }) {
let response = await session.get("https://focuslcds.com/");
assureResponse(response);
let urls = syncpipe(null, [
_ => surgeon(`selectMany ".category-list a" | readAttr href`, response.body.toString()),
_ => uniqueArray(_),
_ => _.filter((relativeURL) => relativeURL !== ""),
_ => _.map((relativeURL) => url.resolve("https://focuslcds.com/", relativeURL))
]);
for (let url of urls) {
createItem({
id: `focus-lcds:category:${url}`,
tags: [ "focus-lcds:category" ],
data: { url: url }
});
}
}
},
"focus-lcds:scrapeCategory": {
ttl: "15d",
taskInterval: "1m",
run: async function({ data, createItem }) {
let response = await session.get(data.url);
assureResponse(response);
let body = response.body.toString();
let nextPageURL = surgeon(`selectMaybeN ".pagination-item--next a" 0 | readAttr href`, body);
if (nextPageURL != null) {
createItem({
id: `focus-lcds:category:${nextPageURL}`,
tags: [ "focus-lcds:category" ],
data: { url: nextPageURL }
});
}
let items = surgeon(`selectMany "ul.productList .listItem-title a" | readAttr href`, body);
for (let url of items) {
createItem({
id: `focus-lcds:product:${url}`,
tags: [ "focus-lcds:product" ],
data: { url: url }
});
}
}
},
// Scrapes a single Focus LCDs product page and stores the extracted fields under `itemData` on the item.
"focus-lcds:scrapeProduct": {
ttl: "15d",
taskInterval: "5s",
run: async function({ data, updateData, expireDependents }) {
let response = await session.get(data.url);
assureResponse(response);
let body = response.body.toString();
// Each key below is one extracted field; array values are a selector query followed by a post-processing step.
// NOTE(review): the post-processing callbacks call string methods on their input; presumably the `selectMaybeOne` pipeline stops before them when nothing matches — confirm against surgeon-utils.
let itemData = surgeon({
name: [ `selectOne "meta[property='og:title']" | readAttr content`, extractModelNumber ],
manufacturer: `selectMaybeOne "[itemprop='brand']" | text | ignoreEmptyString`,
description: [
`selectMaybeOne "meta[name='description']" | readAttr content`,
// Get rid of the keyword spam...
(description) => description.split(",")[0]
],
image: `selectMaybeOne "meta[property='og:image']" | readAttr content`,
price: `selectMaybeOne .productView-price .price--withoutTax | text`,
datasheetURL: [
`selectMaybeOne ".productView-info a[href^='/content/']" | readAttr href`,
(relativeURL) => (!/^\/content\/?$/.test(relativeURL))
? url.resolve("https://focuslcds.com/", relativeURL)
: null // Ignore when the datasheet URL is *just* /content/, as that means there is no datasheet for this product
],
// One { name, value } entry per spec-name element in the description tab
technicalSpecs: [ `selectAny "#tab-description .productView-info-name"`, {
name: `text`,
value: `nextUntil ".productView-info-name" ".productView-info-value" | index 0 | text`
}]
}, body);
// Keep whatever data the item already had, and replace only `itemData`
updateData((oldData) => ({
... oldData,
itemData: itemData
}));
// Called after the data update; focus-lcds:normalizeProduct declares a dependency on this task
expireDependents();
}
},
"focus-lcds:normalizeProduct": {
dependsOn: [ "focus-lcds:scrapeProduct" ],
parallelTasks: 50,
run: async function (api) {
let { data } = api;
createDatasheet(api, {
priority: 0.8,
source: "focus-lcds",
manufacturer: data.itemData.manufacturer ?? "Focus LCDs",
productID: null,
name: data.itemData.name,
description: data.itemData.description,
url: data.itemData.datasheetURL
});
}
},
}
};
};

@ -0,0 +1,108 @@
"use strict";
const assert = require("assert");
const assureResponse = require("../../shared/assure-response");
const createDatasheet = require("../../shared/create-datasheet");
// LCSC
// TODO: Validate response formats with validatem instead
module.exports = function ({ session }) {
return {
seed: [{
id: "lcsc:home",
tags: [ "lcsc:home" ],
data: {}
}],
tags: {
"lcsc:home": [ "lcsc:findCategories" ],
"lcsc:category": [ "lcsc:scrapeCategory" ],
"lcsc:product": [ "lcsc:normalizeProduct" ],
},
tasks: {
"lcsc:findCategories": {
ttl: "30d",
version: "1",
run: async function ({ storeItem }) {
let response = await session.get("https://wwwapi.lcsc.com/v1/home/category");
assureResponse(response);
assert(response.body.length > 0);
assert(response.statusCode === 200);
function processCategoryEntries(categories) {
for (let category of categories) {
let productCount = category.productNum;
let pageCount = Math.ceil(productCount / 500);
// NOTE: We do *not* use the page count indicated by the API, but instead calculate it ourself from the product count. This is because the API-specified page count will cap out at the equivalent of 10k items, even when more pages than that are actually available.
for (let i = 1; i <= pageCount; i++) {
storeItem({
id: `lcsc:category:${category.catalogId}:page-${i}`,
tags: [ "lcsc:category" ],
data: {
... category,
pageNumber: i
}
});
}
if (category.childCatelogs != null) {
processCategoryEntries(category.childCatelogs);
}
}
}
processCategoryEntries(response.body);
}
},
// Fetches one page (500 entries) of a category's product list, stores an item per product, then deletes the page item itself.
"lcsc:scrapeCategory": {
ttl: "30d",
taskInterval: "1m",
// NOTE(review): uses `storeItem`, whereas the other scrapers shown alongside this one use `createItem` — confirm that the task API provides `storeItem`.
run: async function ({ data, storeItem, deleteItem }) {
// `catalogId` and `pageNumber` are read from the page item's own data
let response = await session.post(`https://wwwapi.lcsc.com/v1/products/list`, {
catalogIdList: [ data.catalogId ],
currentPage: data.pageNumber,
pageSize: 500,
paramNameValueMap: {}
});
assureResponse(response);
assert(response.statusCode === 200);
assert(response.body.productList != null); // Missing from stale queued requests?
// An empty page is treated as a failure rather than as "no products"
assert(response.body.productList.length > 0);
for (let item of response.body.productList) {
storeItem({
// NOTE: item.productId seems like the database ID on the website, but item.productCode is the actual LCSC part number used internally for inventory management, so we use that for identification instead
id: `lcsc:product:${item.productCode}`,
tags: [ "lcsc:product" ],
data: item
});
}
// We don't keep around page items, because the amount of pages for a category can change, and so this isn't a stable identifier. They'll be recreated on the next category scrape anyway.
deleteItem();
}
},
"lcsc:normalizeProduct": {
version: "7",
parallelTasks: 50,
run: async function (api) {
let { data } = api;
createDatasheet(api, {
priority: 0.4,
source: "lcsc",
manufacturer: data.brandNameEn,
productID: data.productCode,
name: data.productModel,
description: data.productIntroEn,
url: data.pdfUrl
});
}
},
}
};
};

@ -0,0 +1,211 @@
"use strict";
const cheerio = require("cheerio");
const url = require("url");
const assert = require("assert");
const syncpipe = require("syncpipe");
const htmlEntities = require("html-entities");
const createDatasheet = require("../../shared/create-datasheet");
const assureResponse = require("../../shared/assure-response");
const getUntaggedText = require("../../shared/get-untagged-text");
// ST Microelectronics
/**
 * Pull the first `"prmisID":"…"` value out of a page's source text.
 *
 * This is a quick-and-dirty regex match on the raw text, not a JSON parse.
 *
 * @param {string} string - Raw text of a category page.
 * @returns {string} The captured (non-empty) ID.
 * @throws {Error} When no `prmisID` entry is present.
 */
function extractID(string) {
	const found = /"prmisID":"([^"]+)"/.exec(string);

	if (found == null) {
		throw new Error(`ST: prmis ID expected but not found`);
	}

	return found[1];
}
module.exports = function ({ session }) {
return {
seed: [{
id: "st:home",
tags: [ "st:home" ],
data: {}
}],
tags: {
"st:home": [ "st:findCategories" ],
"st:category": [ "st:scrapeCategory" ],
"st:product": [ "st:scrapeProduct", "st:normalizeProduct" ],
},
tasks: {
"st:findCategories": {
ttl: "15d",
run: async function ({ createItem }) {
let response = await session.get("https://www.st.com/content/st_com/en.html");
assureResponse(response);
let $ = cheerio.load(response.body);
let links = $("#Top_Menu_Products :is(a.st-nav__blockmenu-title, a.st-nav__blockmenu-link)")
.toArray()
.map((element) => $(element).attr("href"))
.map((relativeURL) => url.resolve("https://www.st.com/", relativeURL));
for (let link of links) {
createItem({
id: `st:category:${link}`,
tags: [ "st:category" ],
data: { url: link }
});
}
}
},
"st:scrapeCategory": {
ttl: "1d",
taskInterval: "60s",
version: "2",
run: async function({ data, createItem }) {
let response = await session.get(data.url);
assureResponse(response);
let prmisID = extractID(response.body.toString());
let listingResponse = await session.get(`https://www.st.com/en/documentation/scraper.cxst-ps-grid.html/${encodeURIComponent(prmisID)}.json`, { noDecode: true });
assureResponse(response);
let listingBuffer = listingResponse.body;
if (listingBuffer.length > 0) {
// This is a category that has a product explorer
let listing = JSON.parse(listingBuffer.toString());
let cellNames = listing.columns.map((column) => {
let cellName = (column.identifier != null)
? `${column.identifier}_${column.qualifier_identifier}`
: `nonstandard:${column.name}:${column.qualifier}`
createItem({
id: `st:column:${cellName}`,
tags: [ "st:column" ],
data: column
});
return cellName;
});
for (let row of listing.rows) {
assert(row.productId != null);
let cellData = syncpipe(row.cells, [
(_) => _.map((cell, i) => [ cellNames[i], cell.value ]),
(_) => Object.fromEntries(_)
]);
createItem({
id: `st:product:${row.productId}`,
tags: [ "st:product" ],
data: {
... row,
cells: undefined,
cellData: cellData
}
});
}
} else {
console.warn("Warning: empty response, category does not have product explorer");
}
}
},
"st:scrapeProduct": {
ttl: "15d",
taskInterval: "5s",
run: async function({ data, createItem, updateData, expireDependents }) {
if (data.productFolderUrl == null) {
throw new Error(`No known product page URL`);
}
let productPageURL = url.resolve("https://www.st.com/", data.productFolderUrl);
let response = await session.get(productPageURL);
assureResponse(response);
let $ = cheerio.load(response.body);
let datasheetLink = $("a[data-js='datasheetLink']").attr("href");
let datasheetURL = (datasheetLink != null)
? url.resolve(productPageURL, datasheetLink)
: null;
let resources = $(".st-table--resources")
.find("h3").toArray()
.map((heading) => {
let $heading = $(heading);
let sectionID = $heading.attr("id");
let sectionTitle = $heading.text().trim();
let $table = $heading.next("table");
let items = $table
.find("tbody tr").toArray()
.map((row) => {
let $row = $(row);
let $mainView = $row.find(".visible-on-desktop-only");
let $link = $mainView.find("a.st-link");
return {
url: url.resolve(productPageURL, $link.attr("href")),
documentID: $link.find("span.st-font--bold").text().trim(),
description: $link.find("span:not(.st-font--bold)").text().trim(),
version: getUntaggedText($link),
date: $row.find(".visible-on-desktop-only[data-latest-update]").text().trim()
};
});
return {
sectionID: sectionID,
sectionTitle: sectionTitle,
items: items
};
});
updateData((data) => {
return {
... data,
datasheetLink: datasheetURL,
resources: resources
};
});
expireDependents();
for (let section of resources) {
for (let resource of section.items) {
createItem({
id: `st:resource:${resource.url}`,
tags: (resource.url === datasheetURL)
? [ "st:resource", "st:datasheet" ]
: [ "st_resource" ],
data: { url: resource.url }
});
}
}
}
},
"st:normalizeProduct": {
dependsOn: [ "st:scrapeProduct" ],
version: "8",
parallelTasks: 50,
run: async function (api) {
let { data } = api;
createDatasheet(api, {
priority: 0.8,
source: "st",
manufacturer: "STMicroelectronics",
productID: data.productId,
name: data.cellData["XJE010_VT-007"],
description: htmlEntities.decode(data.cellData["XJE014_VT-007"]),
url: data.datasheetLink
});
}
},
}
};
};

@ -0,0 +1,202 @@
"use strict";
const assert = require("assert");
const cheerio = require("cheerio");
const syncpipe = require("syncpipe");
const url = require("url");
const pipe = require("@promistream/pipe");
const simpleSink = require("@promistream/simple-sink");
const assureResponse = require("../../shared/assure-response");
const parseSitemapResponse = require("../../shared/parse-sitemap-response");
const createDatasheet = require("../../shared/create-datasheet");
// TME.eu
/**
 * Return the first entry that is neither nullish nor the empty string.
 *
 * @param {Iterable<*>} options - Candidates, in order of preference.
 * @returns {*} The first usable candidate, or `undefined` when there is none.
 */
function firstMatch(options) {
	for (const candidate of options) {
		const isBlank = (candidate == null || candidate === "");

		if (!isBlank) {
			return candidate;
		}
	}

	return undefined;
}
module.exports = function ({ session }) {
return {
seed: [{
id: "tme:sitemap:index",
tags: [ "tme:sitemap" ],
data: { url: "https://www.tme.eu/en/sitemap.xml" }
// TODO: Delete derived sitemap entries
}],
tags: {
"tme:sitemap": [ "tme:scrapeSitemap" ],
"tme:product": [ "tme:scrapeProduct", "tme:normalizeProduct" ],
},
tasks: {
// Streams one TME sitemap and creates items for the entries of interest: nested component sitemaps and product detail pages.
"tme:scrapeSitemap": {
ttl: "3d",
taskInterval: "30s",
// NOTE(review): `deleteItem` is destructured but only referenced in the commented-out call at the end of this function.
run: async function ({ data, createItem, deleteItem }) {
let response = await session.get(data.url, { stream: true });
assureResponse(response);
// Counts only the entries that resulted in an item being created
let resultCount = 0;
await pipe([
parseSitemapResponse(response),
simpleSink((item) => {
assert(item.url);
if (item.type === "sitemap") {
// NOTE: We are only interested in the sitemaps that enumerate components, not those that list categories etc.
if (/pip_part[0-9]+\.xml(\.gz)?/.test(item.url)) {
createItem({
id: `tme:sitemap:${item.url}`,
tags: [ "tme:sitemap" ],
data: { url: item.url }
});
resultCount += 1;
}
} else if (item.type === "url") {
// Only product detail pages are turned into items; other URLs are ignored
if (item.url.startsWith("https://www.tme.eu/en/details/")) {
createItem({
id: `tme:product:${item.url}`,
tags: [ "tme:product" ],
data: { url: item.url }
});
resultCount += 1;
}
}
})
]).read();
// If we don't get at least *some* items out of a sitemap, something is wrong - eg. the URL format changed and we are no longer matching anything.
assert(resultCount > 0);
// FIXME: Do this here? Or is there a reason *not* to delete the sitemap entry?
// deleteItem();
}
},
// Scrapes a TME product page into `itemData` (IDs, manufacturer, model, description, documents), aliases the item by its TME product ID, and expires dependent tasks.
"tme:scrapeProduct": {
ttl: "60d",
taskInterval: "500ms",
run: async function ({ data, createAlias, updateData, expireDependents }) {
let response = await session.get(data.url);
assureResponse(response);
let $ = cheerio.load(response.body);
// FIXME: This is currently broken!
// NOTE(review): `allMetaHeaders` is not referenced anywhere else in this function.
let allMetaHeaders = syncpipe($("h2.o-semantic-only-header").toArray(), [
(_) => _.map((header) => {
let $header = $(header);
return [
$header.find(".name").text().trim(),
$header.find(".value").text().trim()
];
}),
(_) => Object.fromEntries(_)
]);
let descriptionElement = $(".c-pip__description > h2").eq(0);
let itemData = {
productID: $("h2.c-pip__symbol--tme .c-pip__symbol-value").eq(0).text().trim(),
manufacturer: $("a.pip__product-header-title").eq(0).text().trim(),
// Prefer the manufacturer part number; fall back to the TME symbol when it is empty
model: firstMatch([
$("h2.c-pip__symbol--producer .c-pip__symbol-value").eq(0).text().trim(), // Manufacturer part number
$("h2.c-pip__symbol--tme .c-pip__symbol-value").eq(0).text().trim(), // TME Symbol
]),
description: (descriptionElement.children().length === 0) // This skips meta fields if there is no description element
? descriptionElement.text().trim()
: null,
// Links without an href are dropped; the rest are resolved against the product page URL
documents: $("div.c-pip__document > a").toArray()
.map((link) => {
let relativeLink = $(link).attr("href");
if (relativeLink != null) {
return {
description: $(link).text().trim(),
url: url.resolve(
data.url,
relativeLink
)
};
} else {
// Probably a video popup
return null;
}
})
.filter((item) => {
return item != null;
}),
// TODO: Scrape prices
};
// NOTE(review): productID and manufacturer come from `.text().trim()`, which presumably always yields a string, so these two `!= null` checks would not catch an empty value — confirm whether empty strings should be rejected here.
assert(itemData.productID != null);
assert(itemData.manufacturer != null);
assert(itemData.model != null);
createAlias({ from: `tme:product:${itemData.productID}` });
// Keep the existing item data and replace only `itemData`
updateData((oldData) => ({
... oldData,
itemData: itemData
}));
expireDependents();
}
},
"tme:normalizeProduct": {
dependsOn: [ "tme:scrapeProduct" ],
version: "5",
parallelTasks: 50,
run: async function (api) {
let { data } = api;
function isEnglish(document) {
return /\sen\s*$/i.test(document.description);
}
if (data.itemData.documents.length > 0) {
if (typeof data.itemData.documents[0] === "string") {
// Temporary workaround for the dependsOn not taking into account task versions, and some old records existing with a wrong documents structure
return;
}
let manufacturer = data.itemData.manufacturer;
let modelName = data.itemData.model;
let description = data.itemData.description;
let productID = data.itemData.productID;
let firstEnglish = data.itemData.documents.find((document) => isEnglish(document));
let bestDocument = (firstEnglish != null)
? firstEnglish
: data.itemData.documents[0];
let bestDocumentIsEnglish = isEnglish(bestDocument);
createDatasheet(api, {
priority: (bestDocumentIsEnglish)
? 0.6
: 0.5,
source: "tme",
manufacturer: manufacturer,
productID: productID,
name: modelName,
description: description,
url: bestDocument.url,
// NOTE: Most (but not all!) manufacturers on TME are, incorrectly, in ALL-CAPS. This 'fixes' those cases through best-effort capitalization. Many (but less!) will still be wrong and need to be fixed later.
fixCasing: true
});
}
}
},
}
};
};

@ -1,12 +0,0 @@
"use strict";
module.exports = function extractID(string) {
// Quick-and-dirty category ID parsing from category pages
let match = /"prmisID":"([^"]+)"/.exec(string);
if (match != null) {
return match[1];
} else {
throw new Error(`ST: prmis ID expected but not found`);
}
};

@ -1,28 +0,0 @@
"use strict";
const cheerio = require("cheerio");
const url = require("url");
const assureResponse = require("../../shared/assure-response");
module.exports = function findCategories({ session }) {
return async function ({ createItem }) {
let response = await session.get("https://www.st.com/content/st_com/en.html");
assureResponse(response);
let $ = cheerio.load(response.body);
let links = $("#Top_Menu_Products :is(a.st-nav__blockmenu-title, a.st-nav__blockmenu-link)")
.toArray()
.map((element) => $(element).attr("href"))
.map((relativeURL) => url.resolve("https://www.st.com/", relativeURL));
for (let link of links) {
createItem({
id: `st:category:${link}`,
tags: [ "st:category" ],
data: { url: link }
});
}
};
};

@ -1,20 +0,0 @@
"use strict";
const htmlEntities = require("html-entities");
const createDatasheet = require("../../shared/create-datasheet");
module.exports = function normalizeProduct() {
return async function (api) {
let { data } = api;
createDatasheet(api, {
priority: 0.8,
source: "st",
manufacturer: "STMicroelectronics",
productID: data.productId,
name: data.cellData["XJE010_VT-007"],
description: htmlEntities.decode(data.cellData["XJE014_VT-007"]),
url: data.datasheetLink
});
};
};

@ -1,60 +0,0 @@
"use strict";
const assert = require("assert");
const syncpipe = require("syncpipe");
const assureResponse = require("../../shared/assure-response");
const extractId = require("../extract-id");
module.exports = function scrapeCategory({ session }) {
return async function({ data, createItem }) {
let response = await session.get(data.url);
assureResponse(response);
let prmisID = extractId(response.body.toString());
let listingResponse = await session.get(`https://www.st.com/en/documentation/scraper.cxst-ps-grid.html/${encodeURIComponent(prmisID)}.json`, { noDecode: true });
assureResponse(response);
let listingBuffer = listingResponse.body;
if (listingBuffer.length > 0) {
// This is a category that has a product explorer
let listing = JSON.parse(listingBuffer.toString());
let cellNames = listing.columns.map((column) => {
let cellName = (column.identifier != null)
? `${column.identifier}_${column.qualifier_identifier}`
: `nonstandard:${column.name}:${column.qualifier}`
createItem({
id: `st:column:${cellName}`,
tags: [ "st:column" ],
data: column
});
return cellName;
});
for (let row of listing.rows) {
assert(row.productId != null);
let cellData = syncpipe(row.cells, [
(_) => _.map((cell, i) => [ cellNames[i], cell.value ]),
(_) => Object.fromEntries(_)
]);
createItem({
id: `st:product:${row.productId}`,
tags: [ "st:product" ],
data: {
... row,
cells: undefined,
cellData: cellData
}
});
}
} else {
console.warn("Warning: empty response, category does not have product explorer");
}
};
};

@ -1,81 +0,0 @@
"use strict";
const cheerio = require("cheerio");
const url = require("url");
const assureResponse = require("../../shared/assure-response");
const getUntaggedText = require("../../shared/get-untagged-text");
module.exports = function scrapeProduct({ session }) {
return async function({ data, createItem, updateData, expireDependents }) {
if (data.productFolderUrl == null) {
throw new Error(`No known product page URL`);
}
let productPageURL = url.resolve("https://www.st.com/", data.productFolderUrl);
let response = await session.get(productPageURL);
assureResponse(response);
let $ = cheerio.load(response.body);
let datasheetLink = $("a[data-js='datasheetLink']").attr("href");
let datasheetURL = (datasheetLink != null)
? url.resolve(productPageURL, datasheetLink)
: null;
let resources = $(".st-table--resources")
.find("h3").toArray()
.map((heading) => {
let $heading = $(heading);
let sectionID = $heading.attr("id");
let sectionTitle = $heading.text().trim();
let $table = $heading.next("table");
let items = $table
.find("tbody tr").toArray()
.map((row) => {
let $row = $(row);
let $mainView = $row.find(".visible-on-desktop-only");
let $link = $mainView.find("a.st-link");
return {
url: url.resolve(productPageURL, $link.attr("href")),
documentID: $link.find("span.st-font--bold").text().trim(),
description: $link.find("span:not(.st-font--bold)").text().trim(),
version: getUntaggedText($link),
date: $row.find(".visible-on-desktop-only[data-latest-update]").text().trim()
};
});
return {
sectionID: sectionID,
sectionTitle: sectionTitle,
items: items
};
});
updateData((data) => {
return {
... data,
datasheetLink: datasheetURL,
resources: resources
};
});
expireDependents();
for (let section of resources) {
for (let resource of section.items) {
createItem({
id: `st:resource:${resource.url}`,
tags: (resource.url === datasheetURL)
? [ "st:resource", "st:datasheet" ]
: [ "st_resource" ],
data: { url: resource.url }
});
}
}
};
};

@ -1,47 +0,0 @@
"use strict";
const createDatasheet = require("../../shared/create-datasheet");
/**
 * Tells whether a document's description ends in a standalone "en" marker:
 * a whitespace character, then "en" in any letter case, then optional
 * trailing whitespace.
 *
 * @param {object} document - Object with a `description` property.
 * @returns {boolean}
 */
function isEnglish(document) {
	const trailingLanguageMarker = /\sen\s*$/i;

	return trailingLanguageMarker.test(document.description);
}
module.exports = function tmeNormalizeProduct() {
return async function (api) {
let { data } = api;
if (data.itemData.documents.length > 0) {
if (typeof data.itemData.documents[0] === "string") {
// Temporary workaround for the dependsOn not taking into account task versions, and some old records existing with a wrong documents structure
return;
}
let manufacturer = data.itemData.manufacturer;
let modelName = data.itemData.model;
let description = data.itemData.description;
let productID = data.itemData.productID;
let firstEnglish = data.itemData.documents.find((document) => isEnglish(document));
let bestDocument = (firstEnglish != null)
? firstEnglish
: data.itemData.documents[0];
let bestDocumentIsEnglish = isEnglish(bestDocument);
createDatasheet(api, {
priority: (bestDocumentIsEnglish)
? 0.6
: 0.5,
source: "tme",
manufacturer: manufacturer,
productID: productID,
name: modelName,
description: description,
url: bestDocument.url,
// NOTE: Most (but not all!) manufacturers on TME are, incorrectly, in ALL-CAPS. This 'fixes' those cases through best-effort capitalization. Many (but less!) will still be wrong and need to be fixed later.
fixCasing: true
});
}
};
};

@ -1,86 +0,0 @@
"use strict";
const assert = require("assert");
const cheerio = require("cheerio");
const syncpipe = require("syncpipe");
const url = require("url");
const assureResponse = require("../../shared/assure-response");
/**
 * Returns the first entry of `options` that is neither null/undefined nor an
 * empty string, or undefined when there is no such entry.
 *
 * @param {Iterable<*>} options - Candidates, in order of preference.
 * @returns {*} The first usable candidate, or undefined.
 */
function firstMatch(options) {
	for (const candidate of options) {
		const isBlank = (candidate == null || candidate === "");

		if (isBlank) {
			continue;
		}

		return candidate;
	}

	return undefined;
}
module.exports = function tmeScrapeProduct({ session }) {
return async function ({ data, createAlias, updateData, expireDependents }) {
let response = await session.get(data.url);
assureResponse(response);
let $ = cheerio.load(response.body);
// FIXME: This is currently broken!
let allMetaHeaders = syncpipe($("h2.o-semantic-only-header").toArray(), [
(_) => _.map((header) => {
let $header = $(header);
return [
$header.find(".name").text().trim(),
$header.find(".value").text().trim()
];
}),
(_) => Object.fromEntries(_)
]);
let descriptionElement = $(".c-pip__description > h2").eq(0);
let itemData = {
productID: $("h2.c-pip__symbol--tme .c-pip__symbol-value").eq(0).text().trim(),
manufacturer: $("a.pip__product-header-title").eq(0).text().trim(),
model: firstMatch([
$("h2.c-pip__symbol--producer .c-pip__symbol-value").eq(0).text().trim(), // Manufacturer part number
$("h2.c-pip__symbol--tme .c-pip__symbol-value").eq(0).text().trim(), // TME Symbol
]),
description: (descriptionElement.children().length === 0) // This skips meta fields if there is no description element
? descriptionElement.text().trim()
: null,
documents: $("div.c-pip__document > a").toArray()
.map((link) => {
let relativeLink = $(link).attr("href");
if (relativeLink != null) {
return {
description: $(link).text().trim(),
url: url.resolve(
data.url,
relativeLink
)
};
} else {
// Probably a video popup
return null;
}
})
.filter((item) => {
return item != null;
}),
// TODO: Scrape prices
};
assert(itemData.productID != null);
assert(itemData.manufacturer != null);
assert(itemData.model != null);
createAlias({ from: `tme:product:${itemData.productID}` });
updateData((oldData) => ({
... oldData,
itemData: itemData
}));
expireDependents();
};
};

@ -1,50 +0,0 @@
"use strict";
const assert = require("assert");
const pipe = require("@promistream/pipe");
const simpleSink = require("@promistream/simple-sink");
const assureResponse = require("../../shared/assure-response");
const parseSitemapResponse = require("../../shared/parse-sitemap-response");
module.exports = function tmeScrapeSitemap({ session }) {
return async function ({ data, createItem }) {
let response = await session.get(data.url, { stream: true });
assureResponse(response);
let resultCount = 0;
await pipe([
parseSitemapResponse(response),
simpleSink((item) => {
assert(item.url);
if (item.type === "sitemap") {
// NOTE: We are only interested in the sitemaps that enumerate components, not those that list categories etc.
if (/pip_part[0-9]+\.xml(\.gz)?/.test(item.url)) {
createItem({
id: `tme:sitemap:${item.url}`,
tags: [ "tme:sitemap" ],
data: { url: item.url }
});
resultCount += 1;
}
} else if (item.type === "url") {
if (item.url.startsWith("https://www.tme.eu/en/details/")) {
createItem({
id: `tme:product:${item.url}`,
tags: [ "tme:product" ],
data: { url: item.url }
});
resultCount += 1;
}
}
})
]).read();
// If we don't get at least *some* items out of a sitemap, something is wrong - eg. the URL format changed and we are no longer matching anything.
assert(resultCount > 0);
};
};

@ -24,12 +24,13 @@
"html-entities": "^2.1.1",
"map-obj": "^4.2.0",
"match-value": "^1.1.0",
"merge-by-template": "^0.1.4",
"pianola": "^2.2.1",
"surgeon": "^3.16.4",
"syncpipe": "^1.0.0"
},
"devDependencies": {
"@joepie91/eslint-config": "^1.1.0",
"@joepie91/eslint-config": "^1.1.1",
"eslint": "^7.22.0"
}
}

@ -0,0 +1,3 @@
"use strict";
// Scratch script: loads "execall" through fix-esm's require() and prints the string form of whatever it exports.
const fixEsm = require("fix-esm");
const execall = fixEsm.require("execall");

console.log(execall.toString());

@ -0,0 +1,7 @@
- add source: http://www.injoinic.com/product_detail/id/21.html
- LCSC: remove placeholder URLs that aren't actually datasheets
- rework createDatasheet:
- retain alternate options
- track language of each entry (when known)
- also retain non-datasheet documentation
- add source: realtek

@ -263,10 +263,10 @@
resolved "https://registry.yarnpkg.com/@joepie91/consumable/-/consumable-1.0.1.tgz#fd223a481b89b43bfe98687bd7f7ce586826f832"
integrity sha512-LUOoJmFAJ6ocqymtVUiADFvx7T+EFQsfsY6LAOvYBKHlxpWQ/LiQGAi/k5tzATxXpH4/vLC4C9ttRl09/g+HRw==
"@joepie91/eslint-config@^1.1.0":
version "1.1.0"
resolved "https://registry.yarnpkg.com/@joepie91/eslint-config/-/eslint-config-1.1.0.tgz#9397e6ce0a010cb57dcf8aef8754d3a5ce0ae36a"
integrity sha512-XliasRSUfOz1/bAvTBaUlCjWDbceCW4y1DnvFfW7Yw9p2FbNRR0w8WoPdTxTCjKuoZ7/OQMeBxIe2y9Qy6rbYw==
"@joepie91/eslint-config@^1.1.1":
version "1.1.1"
resolved "https://registry.yarnpkg.com/@joepie91/eslint-config/-/eslint-config-1.1.1.tgz#cb276dec6dd25b5777daefbef561850c9717180d"
integrity sha512-q8l83tdpL0YGC24ftlpeHgmQIIRmcpiVhwwEUFPcJ1YXWaee/JjoUs6e5tLKMTNNk+fvDKtq2YPSXkmLQU7h5Q==
"@joepie91/unreachable@^1.0.0":
version "1.0.0"
@ -588,7 +588,7 @@
"@validatem/is-array" "^0.1.0"
"@validatem/validation-result" "^0.1.1"
"@validatem/combinator@^0.1.0", "@validatem/combinator@^0.1.1":
"@validatem/combinator@^0.1.0", "@validatem/combinator@^0.1.1", "@validatem/combinator@^0.1.2":
version "0.1.2"
resolved "https://registry.yarnpkg.com/@validatem/combinator/-/combinator-0.1.2.tgz#eab893d55f1643b9c6857eaf6ff7ed2a728e89ff"
integrity sha512-vE8t1tNXknmN62FlN6LxQmA2c6TwVKZ+fl/Wit3H2unFdOhu7SZj2kRPGjAXdK/ARh/3svYfUBeD75pea0j1Sw==
@ -645,6 +645,32 @@
supports-color "^7.1.0"
syncpipe "^1.0.0"
"@validatem/core@^0.3.3":
version "0.3.17"
resolved "https://registry.yarnpkg.com/@validatem/core/-/core-0.3.17.tgz#1756a7eca0523a3657794d2060273f7d42c083ef"
integrity sha512-VahE9TAKpaU13BcVQI/Dc9j/xsm/BgloRM0v1HjOMpoJ16tOkKQkUdOgiDCG4zmEek1bG3v9Zu4lS1lubgjLMw==
dependencies:
"@validatem/annotate-errors" "^0.1.2"
"@validatem/any-property" "^0.1.0"
"@validatem/error" "^1.0.0"
"@validatem/match-validation-error" "^0.1.0"
"@validatem/match-versioned-special" "^0.1.0"
"@validatem/match-virtual-property" "^0.1.0"
"@validatem/normalize-rules" "^0.1.0"
"@validatem/required" "^0.1.0"
"@validatem/validation-result" "^0.1.1"
"@validatem/virtual-property" "^0.1.0"
as-expression "^1.0.0"
assure-array "^1.0.0"
create-error "^0.3.1"
default-value "^1.0.0"
execall "^2.0.0"
flatten "^1.0.3"
indent-string "^4.0.0"
is-arguments "^1.0.4"
supports-color "^7.1.0"
syncpipe "^1.0.0"
"@validatem/default-to@^0.1.0":
version "0.1.0"
resolved "https://registry.yarnpkg.com/@validatem/default-to/-/default-to-0.1.0.tgz#62766a3ca24d2f61a96c713bcb629a5b3c6427c5"
@ -697,7 +723,7 @@
default-value "^1.0.0"
flatten "^1.0.3"
"@validatem/is-array@^0.1.0":
"@validatem/is-array@^0.1.0", "@validatem/is-array@^0.1.1":
version "0.1.1"
resolved "https://registry.yarnpkg.com/@validatem/is-array/-/is-array-0.1.1.tgz#fbe15ca8c97c30b622a5bbeb536d341e99cfc2c5"
integrity sha512-XD3C+Nqfpnbb4oO//Ufodzvui7SsCIW/stxZ39dP/fyRsBHrdERinkFATH5HepegtDlWMQswm5m1XFRbQiP2oQ==
@ -853,6 +879,15 @@
default-value "^1.0.0"
split-filter-n "^1.1.2"
"@validatem/wrap-path@^0.1.0":
version "0.1.0"
resolved "https://registry.yarnpkg.com/@validatem/wrap-path/-/wrap-path-0.1.0.tgz#777998b62d3e74f2b2897c992dae9b3675161c33"
integrity sha512-6hOqydnr4u8FA0iRv8fyXxsr64T99+w/XL/fixmsgN0uqulEIwGMxCre3y9YkFNcEtysyPHkQl0CrGPcASsZxw==
dependencies:
"@validatem/annotate-errors" "^0.1.2"
"@validatem/combinator" "^0.1.2"
"@validatem/validation-result" "^0.1.2"
"@validatem/wrap-value-as-option@^0.1.0":
version "0.1.0"
resolved "https://registry.yarnpkg.com/@validatem/wrap-value-as-option/-/wrap-value-as-option-0.1.0.tgz#57fa8d535f6cdf40cf8c8846ad45f4dd68f44568"
@ -1969,6 +2004,23 @@ match-value@^1.1.0:
resolved "https://registry.yarnpkg.com/match-value/-/match-value-1.1.0.tgz#ad311ef8bbe2d344a53ec3104e28fe221984b98e"
integrity sha512-NOvpobcmkX+l9Eb6r2s3BkR1g1ZwzExDFdXA9d6p1r1O1olLbo88KuzMiBmg43xSpodfm7I6Hqlx2OoySquEgg==
merge-by-template@^0.1.4:
version "0.1.4"
resolved "https://registry.yarnpkg.com/merge-by-template/-/merge-by-template-0.1.4.tgz#8a03e6383a4e2f2e4a6460bff0d6d3e7b468a535"
integrity sha512-10h5HyGLJJu1F1z02oMqpvMa6oraLr7Vp0gPxlw6Od8xlvzTFr0TQGPZXMLBmZlhZRY910AXGJ6AFc2iXGZ7uQ==
dependencies:
"@validatem/core" "^0.3.3"
"@validatem/default-to" "^0.1.0"
"@validatem/is-array" "^0.1.1"
"@validatem/is-boolean" "^0.1.1"
"@validatem/is-plain-object" "^0.1.1"
"@validatem/remove-nullish-items" "^0.1.0"
"@validatem/virtual-property" "^0.1.0"
"@validatem/wrap-path" "^0.1.0"
default-value "^1.0.0"
fromentries "^1.2.0"
range "^0.0.3"
mime@^1.3.4:
version "1.6.0"
resolved "https://registry.yarnpkg.com/mime/-/mime-1.6.0.tgz#32cd9e5c64553bd58d19a568af452acff04981b1"
@ -2208,6 +2260,11 @@ randexp@0.4.6:
discontinuous-range "1.0.0"
ret "~0.1.10"
range@^0.0.3:
version "0.0.3"
resolved "https://registry.yarnpkg.com/range/-/range-0.0.3.tgz#b5b8eb2463a516b624a563bd32b18fe89e70151b"
integrity sha512-OxK2nY2bmeEB4NxoBraQIBOOeOIxoBvm6yt8MA1kLappgkG3SyLf173iOtT5woWycrtESDD2g0Nl2yt8YPoUnw==
readable-stream@^2.2.2:
version "2.3.7"
resolved "https://registry.yarnpkg.com/readable-stream/-/readable-stream-2.3.7.tgz#1eca1cf711aef814c04f62252a36a62f6cb23b57"

Loading…
Cancel
Save