First pass at refactoring for modularity
parent
6c43818deb
commit
d972746f35
@ -1,30 +0,0 @@
|
|||||||
"use strict";
|
|
||||||
|
|
||||||
const syncpipe = require("syncpipe");
|
|
||||||
const url = require("url");
|
|
||||||
|
|
||||||
const assureResponse = require("../../shared/assure-response");
|
|
||||||
const surgeon = require("../../shared/surgeon-utils");
|
|
||||||
const uniqueArray = require("../../shared/unique-array");
|
|
||||||
|
|
||||||
module.exports = function findCategories({ session }) {
|
|
||||||
return async function({ createItem }) {
|
|
||||||
let response = await session.get("https://focuslcds.com/");
|
|
||||||
assureResponse(response);
|
|
||||||
|
|
||||||
let urls = syncpipe(null, [
|
|
||||||
_ => surgeon(`selectMany ".category-list a" | readAttr href`, response.body.toString()),
|
|
||||||
_ => uniqueArray(_),
|
|
||||||
_ => _.filter((relativeURL) => relativeURL !== ""),
|
|
||||||
_ => _.map((relativeURL) => url.resolve("https://focuslcds.com/", relativeURL))
|
|
||||||
]);
|
|
||||||
|
|
||||||
for (let url of urls) {
|
|
||||||
createItem({
|
|
||||||
id: `focus-lcds:category:${url}`,
|
|
||||||
tags: [ "focus-lcds:category" ],
|
|
||||||
data: { url: url }
|
|
||||||
});
|
|
||||||
}
|
|
||||||
};
|
|
||||||
};
|
|
@ -1,19 +0,0 @@
|
|||||||
"use strict";
|
|
||||||
|
|
||||||
const createDatasheet = require("../../shared/create-datasheet");
|
|
||||||
|
|
||||||
module.exports = function normalizeProduct() {
|
|
||||||
return async function (api) {
|
|
||||||
let { data } = api;
|
|
||||||
|
|
||||||
createDatasheet(api, {
|
|
||||||
priority: 0.8,
|
|
||||||
source: "focus-lcds",
|
|
||||||
manufacturer: data.itemData.manufacturer ?? "Focus LCDs",
|
|
||||||
productID: null,
|
|
||||||
name: data.itemData.name,
|
|
||||||
description: data.itemData.description,
|
|
||||||
url: data.itemData.datasheetURL
|
|
||||||
});
|
|
||||||
};
|
|
||||||
};
|
|
@ -1,33 +0,0 @@
|
|||||||
"use strict";
|
|
||||||
|
|
||||||
const assureResponse = require("../../shared/assure-response");
|
|
||||||
const surgeon = require("../../shared/surgeon-utils");
|
|
||||||
|
|
||||||
module.exports = function scrapeCategory({ session }) {
|
|
||||||
return async function({ data, createItem }) {
|
|
||||||
let response = await session.get(data.url);
|
|
||||||
assureResponse(response);
|
|
||||||
|
|
||||||
let body = response.body.toString();
|
|
||||||
|
|
||||||
let nextPageURL = surgeon(`selectMaybeN ".pagination-item--next a" 0 | readAttr href`, body);
|
|
||||||
|
|
||||||
if (nextPageURL != null) {
|
|
||||||
createItem({
|
|
||||||
id: `focus-lcds:category:${nextPageURL}`,
|
|
||||||
tags: [ "focus-lcds:category" ],
|
|
||||||
data: { url: nextPageURL }
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
let items = surgeon(`selectMany "ul.productList .listItem-title a" | readAttr href`, body);
|
|
||||||
|
|
||||||
for (let url of items) {
|
|
||||||
createItem({
|
|
||||||
id: `focus-lcds:product:${url}`,
|
|
||||||
tags: [ "focus-lcds:product" ],
|
|
||||||
data: { url: url }
|
|
||||||
});
|
|
||||||
}
|
|
||||||
};
|
|
||||||
};
|
|
@ -1,45 +0,0 @@
|
|||||||
"use strict";
|
|
||||||
|
|
||||||
const url = require("url");
|
|
||||||
|
|
||||||
const assureResponse = require("../../shared/assure-response");
|
|
||||||
const surgeon = require("../../shared/surgeon-utils");
|
|
||||||
const extractModelNumber = require("../../shared/extract-model-number");
|
|
||||||
|
|
||||||
module.exports = function scrapeProduct({ session }) {
|
|
||||||
return async function({ data, updateData, expireDependents }) {
|
|
||||||
let response = await session.get(data.url);
|
|
||||||
assureResponse(response);
|
|
||||||
|
|
||||||
let body = response.body.toString();
|
|
||||||
|
|
||||||
let itemData = surgeon({
|
|
||||||
name: [ `selectOne "meta[property='og:title']" | readAttr content`, extractModelNumber ],
|
|
||||||
manufacturer: `selectMaybeOne "[itemprop='brand']" | text | ignoreEmptyString`,
|
|
||||||
description: [
|
|
||||||
`selectMaybeOne "meta[name='description']" | readAttr content`,
|
|
||||||
// Get rid of the keyword spam...
|
|
||||||
(description) => description.split(",")[0]
|
|
||||||
],
|
|
||||||
image: `selectMaybeOne "meta[property='og:image']" | readAttr content`,
|
|
||||||
price: `selectMaybeOne .productView-price .price--withoutTax | text`,
|
|
||||||
datasheetURL: [
|
|
||||||
`selectMaybeOne ".productView-info a[href^='/content/']" | readAttr href`,
|
|
||||||
(relativeURL) => (!/^\/content\/?$/.test(relativeURL))
|
|
||||||
? url.resolve("https://focuslcds.com/", relativeURL)
|
|
||||||
: null // Ignore when the datasheet URL is *just* /content/, as that means there is no datasheet for this product
|
|
||||||
],
|
|
||||||
technicalSpecs: [ `selectAny "#tab-description .productView-info-name"`, {
|
|
||||||
name: `text`,
|
|
||||||
value: `nextUntil ".productView-info-name" ".productView-info-value" | index 0 | text`
|
|
||||||
}]
|
|
||||||
}, body);
|
|
||||||
|
|
||||||
updateData((oldData) => ({
|
|
||||||
... oldData,
|
|
||||||
itemData: itemData
|
|
||||||
}));
|
|
||||||
|
|
||||||
expireDependents();
|
|
||||||
};
|
|
||||||
};
|
|
@ -1,42 +0,0 @@
|
|||||||
"use strict";
|
|
||||||
|
|
||||||
const assert = require("assert");
|
|
||||||
|
|
||||||
const assureResponse = require("../../shared/assure-response");
|
|
||||||
|
|
||||||
module.exports = function lcscFindCategories(state) {
|
|
||||||
let { session } = state;
|
|
||||||
|
|
||||||
return async function ({ createItem }) {
|
|
||||||
let response = await session.get("https://wwwapi.lcsc.com/v1/home/category");
|
|
||||||
|
|
||||||
assureResponse(response);
|
|
||||||
assert(response.body.length > 0);
|
|
||||||
assert(response.statusCode === 200);
|
|
||||||
|
|
||||||
function processCategoryEntries(categories) {
|
|
||||||
for (let category of categories) {
|
|
||||||
let productCount = category.productNum;
|
|
||||||
let pageCount = Math.ceil(productCount / 500);
|
|
||||||
|
|
||||||
// NOTE: We do *not* use the page count indicated by the API, but instead calculate it ourself from the product count. This is because the API-specified page count will cap out at the equivalent of 10k items, even when more pages than that are actually available.
|
|
||||||
for (let i = 1; i <= pageCount; i++) {
|
|
||||||
createItem({
|
|
||||||
id: `lcsc:category:${category.catalogId}:page-${i}`,
|
|
||||||
tags: [ "lcsc:category" ],
|
|
||||||
data: {
|
|
||||||
... category,
|
|
||||||
pageNumber: i
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
if (category.childCatelogs != null) {
|
|
||||||
processCategoryEntries(category.childCatelogs);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
processCategoryEntries(response.body);
|
|
||||||
};
|
|
||||||
};
|
|
@ -1,19 +0,0 @@
|
|||||||
"use strict";
|
|
||||||
|
|
||||||
const createDatasheet = require("../../shared/create-datasheet");
|
|
||||||
|
|
||||||
module.exports = function lcscNormalizeProduct() {
|
|
||||||
return async function (api) {
|
|
||||||
let { data } = api;
|
|
||||||
|
|
||||||
createDatasheet(api, {
|
|
||||||
priority: 0.4,
|
|
||||||
source: "lcsc",
|
|
||||||
manufacturer: data.brandNameEn,
|
|
||||||
productID: data.productCode,
|
|
||||||
name: data.productModel,
|
|
||||||
description: data.productIntroEn,
|
|
||||||
url: data.pdfUrl
|
|
||||||
});
|
|
||||||
};
|
|
||||||
};
|
|
@ -1,37 +0,0 @@
|
|||||||
"use strict";
|
|
||||||
|
|
||||||
const assert = require("assert");
|
|
||||||
|
|
||||||
const assureResponse = require("../../shared/assure-response");
|
|
||||||
|
|
||||||
// TODO: Validate response formats with validatem instead
|
|
||||||
|
|
||||||
module.exports = function lcscScrapeCategory(state) {
|
|
||||||
let { session } = state;
|
|
||||||
|
|
||||||
return async function ({ data, createItem, deleteItem, updateData }) {
|
|
||||||
let response = await session.post(`https://wwwapi.lcsc.com/v1/products/list`, {
|
|
||||||
catalogIdList: [ data.catalogId ],
|
|
||||||
currentPage: data.pageNumber,
|
|
||||||
pageSize: 500,
|
|
||||||
paramNameValueMap: {}
|
|
||||||
});
|
|
||||||
|
|
||||||
assureResponse(response);
|
|
||||||
assert(response.statusCode === 200);
|
|
||||||
assert(response.body.productList != null); // Missing from stale queued requests?
|
|
||||||
assert(response.body.productList.length > 0);
|
|
||||||
|
|
||||||
for (let item of response.body.productList) {
|
|
||||||
createItem({
|
|
||||||
// NOTE: item.productId seems like the database ID on the website, but item.productCode is the actual LCSC part number used internally for inventory management, so we use that for identification instead
|
|
||||||
id: `lcsc:product:${item.productCode}`,
|
|
||||||
tags: [ "lcsc:product" ],
|
|
||||||
data: item
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
// We don't keep around page items, because the amount of pages for a category can change, and so this isn't a stable identifier. They'll be recreated on the next category scrape anyway.
|
|
||||||
deleteItem();
|
|
||||||
};
|
|
||||||
};
|
|
@ -0,0 +1,21 @@
|
|||||||
|
"use strict";
|
||||||
|
|
||||||
|
const mergeByTemplate = require("merge-by-template");
|
||||||
|
|
||||||
|
/**
 * Merge rule for properties that only one source may define.
 *
 * @param {*} a - Value from the earlier source.
 * @param {*} b - Value from the later source.
 * @returns {*} Whichever of the two values is set (`undefined`/`null` when neither is).
 * @throws {Error} When both sources define the property.
 */
function noOverride(a, b) {
	if (a != null && b != null) {
		throw new Error(`Property cannot be overridden`);
	}

	// Hand back the side that is set, so the value survives even if the merger
	// calls this rule when only one side defines the property.
	return a ?? b;
}
|
||||||
|
|
||||||
|
// Template describing how each top-level property of a source definition is merged.
// `backend` and `database` may be set by at most one source (see noOverride).
// NOTE(review): `[]`, `{}` and `anyProperty([])` are merge-by-template rule values;
// their exact semantics are presumably "concatenate arrays" / "merge objects" - confirm against its docs.
const mergeTemplate = {
	backend: noOverride,
	database: noOverride,
	seed: [],
	tags: mergeByTemplate.anyProperty([]),
	tasks: {}
};

const merge = mergeByTemplate.createMerger(mergeTemplate);
|
||||||
|
|
||||||
|
module.exports = function mergeSources(base, sources) {
|
||||||
|
return merge([ base, ... sources ]);
|
||||||
|
};
|
@ -0,0 +1,137 @@
|
|||||||
|
"use strict";
|
||||||
|
|
||||||
|
const syncpipe = require("syncpipe");
|
||||||
|
const url = require("url");
|
||||||
|
|
||||||
|
const assureResponse = require("../../shared/assure-response");
|
||||||
|
const surgeon = require("../../shared/surgeon-utils");
|
||||||
|
const uniqueArray = require("../../shared/unique-array");
|
||||||
|
const extractModelNumber = require("../../shared/extract-model-number");
|
||||||
|
const createDatasheet = require("../../shared/create-datasheet");
|
||||||
|
|
||||||
|
// Focus LCDs
|
||||||
|
|
||||||
|
module.exports = function ({ session }) {
|
||||||
|
return {
|
||||||
|
seed: [{
|
||||||
|
id: "focus-lcds:home",
|
||||||
|
tags: [ "focus-lcds:home" ],
|
||||||
|
data: {}
|
||||||
|
}],
|
||||||
|
tags: {
|
||||||
|
"focus-lcds:home": [ "focus-lcds:findCategories" ],
|
||||||
|
"focus-lcds:category": [ "focus-lcds:scrapeCategory" ],
|
||||||
|
"focus-lcds:product": [ "focus-lcds:scrapeProduct", "focus-lcds:normalizeProduct" ],
|
||||||
|
},
|
||||||
|
tasks: {
|
||||||
|
"focus-lcds:findCategories": {
|
||||||
|
ttl: "60d",
|
||||||
|
run: async function({ createItem }) {
|
||||||
|
let response = await session.get("https://focuslcds.com/");
|
||||||
|
assureResponse(response);
|
||||||
|
|
||||||
|
let urls = syncpipe(null, [
|
||||||
|
_ => surgeon(`selectMany ".category-list a" | readAttr href`, response.body.toString()),
|
||||||
|
_ => uniqueArray(_),
|
||||||
|
_ => _.filter((relativeURL) => relativeURL !== ""),
|
||||||
|
_ => _.map((relativeURL) => url.resolve("https://focuslcds.com/", relativeURL))
|
||||||
|
]);
|
||||||
|
|
||||||
|
for (let url of urls) {
|
||||||
|
createItem({
|
||||||
|
id: `focus-lcds:category:${url}`,
|
||||||
|
tags: [ "focus-lcds:category" ],
|
||||||
|
data: { url: url }
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"focus-lcds:scrapeCategory": {
|
||||||
|
ttl: "15d",
|
||||||
|
taskInterval: "1m",
|
||||||
|
run: async function({ data, createItem }) {
|
||||||
|
let response = await session.get(data.url);
|
||||||
|
assureResponse(response);
|
||||||
|
|
||||||
|
let body = response.body.toString();
|
||||||
|
|
||||||
|
let nextPageURL = surgeon(`selectMaybeN ".pagination-item--next a" 0 | readAttr href`, body);
|
||||||
|
|
||||||
|
if (nextPageURL != null) {
|
||||||
|
createItem({
|
||||||
|
id: `focus-lcds:category:${nextPageURL}`,
|
||||||
|
tags: [ "focus-lcds:category" ],
|
||||||
|
data: { url: nextPageURL }
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
let items = surgeon(`selectMany "ul.productList .listItem-title a" | readAttr href`, body);
|
||||||
|
|
||||||
|
for (let url of items) {
|
||||||
|
createItem({
|
||||||
|
id: `focus-lcds:product:${url}`,
|
||||||
|
tags: [ "focus-lcds:product" ],
|
||||||
|
data: { url: url }
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"focus-lcds:scrapeProduct": {
|
||||||
|
ttl: "15d",
|
||||||
|
taskInterval: "5s",
|
||||||
|
run: async function({ data, updateData, expireDependents }) {
|
||||||
|
let response = await session.get(data.url);
|
||||||
|
assureResponse(response);
|
||||||
|
|
||||||
|
let body = response.body.toString();
|
||||||
|
|
||||||
|
let itemData = surgeon({
|
||||||
|
name: [ `selectOne "meta[property='og:title']" | readAttr content`, extractModelNumber ],
|
||||||
|
manufacturer: `selectMaybeOne "[itemprop='brand']" | text | ignoreEmptyString`,
|
||||||
|
description: [
|
||||||
|
`selectMaybeOne "meta[name='description']" | readAttr content`,
|
||||||
|
// Get rid of the keyword spam...
|
||||||
|
(description) => description.split(",")[0]
|
||||||
|
],
|
||||||
|
image: `selectMaybeOne "meta[property='og:image']" | readAttr content`,
|
||||||
|
price: `selectMaybeOne .productView-price .price--withoutTax | text`,
|
||||||
|
datasheetURL: [
|
||||||
|
`selectMaybeOne ".productView-info a[href^='/content/']" | readAttr href`,
|
||||||
|
(relativeURL) => (!/^\/content\/?$/.test(relativeURL))
|
||||||
|
? url.resolve("https://focuslcds.com/", relativeURL)
|
||||||
|
: null // Ignore when the datasheet URL is *just* /content/, as that means there is no datasheet for this product
|
||||||
|
],
|
||||||
|
technicalSpecs: [ `selectAny "#tab-description .productView-info-name"`, {
|
||||||
|
name: `text`,
|
||||||
|
value: `nextUntil ".productView-info-name" ".productView-info-value" | index 0 | text`
|
||||||
|
}]
|
||||||
|
}, body);
|
||||||
|
|
||||||
|
updateData((oldData) => ({
|
||||||
|
... oldData,
|
||||||
|
itemData: itemData
|
||||||
|
}));
|
||||||
|
|
||||||
|
expireDependents();
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"focus-lcds:normalizeProduct": {
|
||||||
|
dependsOn: [ "focus-lcds:scrapeProduct" ],
|
||||||
|
parallelTasks: 50,
|
||||||
|
run: async function (api) {
|
||||||
|
let { data } = api;
|
||||||
|
|
||||||
|
createDatasheet(api, {
|
||||||
|
priority: 0.8,
|
||||||
|
source: "focus-lcds",
|
||||||
|
manufacturer: data.itemData.manufacturer ?? "Focus LCDs",
|
||||||
|
productID: null,
|
||||||
|
name: data.itemData.name,
|
||||||
|
description: data.itemData.description,
|
||||||
|
url: data.itemData.datasheetURL
|
||||||
|
});
|
||||||
|
}
|
||||||
|
},
|
||||||
|
}
|
||||||
|
};
|
||||||
|
};
|
@ -0,0 +1,108 @@
|
|||||||
|
"use strict";
|
||||||
|
|
||||||
|
const assert = require("assert");
|
||||||
|
|
||||||
|
const assureResponse = require("../../shared/assure-response");
|
||||||
|
const createDatasheet = require("../../shared/create-datasheet");
|
||||||
|
|
||||||
|
// LCSC
|
||||||
|
// TODO: Validate response formats with validatem instead
|
||||||
|
|
||||||
|
module.exports = function ({ session }) {
|
||||||
|
return {
|
||||||
|
seed: [{
|
||||||
|
id: "lcsc:home",
|
||||||
|
tags: [ "lcsc:home" ],
|
||||||
|
data: {}
|
||||||
|
}],
|
||||||
|
tags: {
|
||||||
|
"lcsc:home": [ "lcsc:findCategories" ],
|
||||||
|
"lcsc:category": [ "lcsc:scrapeCategory" ],
|
||||||
|
"lcsc:product": [ "lcsc:normalizeProduct" ],
|
||||||
|
},
|
||||||
|
tasks: {
|
||||||
|
"lcsc:findCategories": {
|
||||||
|
ttl: "30d",
|
||||||
|
version: "1",
|
||||||
|
run: async function ({ storeItem }) {
|
||||||
|
let response = await session.get("https://wwwapi.lcsc.com/v1/home/category");
|
||||||
|
|
||||||
|
assureResponse(response);
|
||||||
|
assert(response.body.length > 0);
|
||||||
|
assert(response.statusCode === 200);
|
||||||
|
|
||||||
|
function processCategoryEntries(categories) {
|
||||||
|
for (let category of categories) {
|
||||||
|
let productCount = category.productNum;
|
||||||
|
let pageCount = Math.ceil(productCount / 500);
|
||||||
|
|
||||||
|
// NOTE: We do *not* use the page count indicated by the API, but instead calculate it ourself from the product count. This is because the API-specified page count will cap out at the equivalent of 10k items, even when more pages than that are actually available.
|
||||||
|
for (let i = 1; i <= pageCount; i++) {
|
||||||
|
storeItem({
|
||||||
|
id: `lcsc:category:${category.catalogId}:page-${i}`,
|
||||||
|
tags: [ "lcsc:category" ],
|
||||||
|
data: {
|
||||||
|
... category,
|
||||||
|
pageNumber: i
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
if (category.childCatelogs != null) {
|
||||||
|
processCategoryEntries(category.childCatelogs);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
processCategoryEntries(response.body);
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"lcsc:scrapeCategory": {
|
||||||
|
ttl: "30d",
|
||||||
|
taskInterval: "1m",
|
||||||
|
run: async function ({ data, storeItem, deleteItem }) {
|
||||||
|
let response = await session.post(`https://wwwapi.lcsc.com/v1/products/list`, {
|
||||||
|
catalogIdList: [ data.catalogId ],
|
||||||
|
currentPage: data.pageNumber,
|
||||||
|
pageSize: 500,
|
||||||
|
paramNameValueMap: {}
|
||||||
|
});
|
||||||
|
|
||||||
|
assureResponse(response);
|
||||||
|
assert(response.statusCode === 200);
|
||||||
|
assert(response.body.productList != null); // Missing from stale queued requests?
|
||||||
|
assert(response.body.productList.length > 0);
|
||||||
|
|
||||||
|
for (let item of response.body.productList) {
|
||||||
|
storeItem({
|
||||||
|
// NOTE: item.productId seems like the database ID on the website, but item.productCode is the actual LCSC part number used internally for inventory management, so we use that for identification instead
|
||||||
|
id: `lcsc:product:${item.productCode}`,
|
||||||
|
tags: [ "lcsc:product" ],
|
||||||
|
data: item
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
// We don't keep around page items, because the amount of pages for a category can change, and so this isn't a stable identifier. They'll be recreated on the next category scrape anyway.
|
||||||
|
deleteItem();
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"lcsc:normalizeProduct": {
|
||||||
|
version: "7",
|
||||||
|
parallelTasks: 50,
|
||||||
|
run: async function (api) {
|
||||||
|
let { data } = api;
|
||||||
|
|
||||||
|
createDatasheet(api, {
|
||||||
|
priority: 0.4,
|
||||||
|
source: "lcsc",
|
||||||
|
manufacturer: data.brandNameEn,
|
||||||
|
productID: data.productCode,
|
||||||
|
name: data.productModel,
|
||||||
|
description: data.productIntroEn,
|
||||||
|
url: data.pdfUrl
|
||||||
|
});
|
||||||
|
}
|
||||||
|
},
|
||||||
|
}
|
||||||
|
};
|
||||||
|
};
|
@ -0,0 +1,211 @@
|
|||||||
|
"use strict";
|
||||||
|
|
||||||
|
const cheerio = require("cheerio");
|
||||||
|
const url = require("url");
|
||||||
|
const assert = require("assert");
|
||||||
|
const syncpipe = require("syncpipe");
|
||||||
|
const htmlEntities = require("html-entities");
|
||||||
|
|
||||||
|
const createDatasheet = require("../../shared/create-datasheet");
|
||||||
|
const assureResponse = require("../../shared/assure-response");
|
||||||
|
const getUntaggedText = require("../../shared/get-untagged-text");
|
||||||
|
|
||||||
|
// ST Microelectronics
|
||||||
|
|
||||||
|
/**
 * Pulls the `prmisID` value out of the raw text of a category page.
 * Quick-and-dirty: a regex over the page source rather than real parsing.
 *
 * @param {string} string - Page source to search.
 * @returns {string} The captured ID.
 * @throws {Error} When no `"prmisID":"..."` pair is present.
 */
function extractID(string) {
	const [ , prmisID ] = /"prmisID":"([^"]+)"/.exec(string) ?? [];

	if (prmisID == null) {
		throw new Error(`ST: prmis ID expected but not found`);
	}

	return prmisID;
}
|
||||||
|
|
||||||
|
module.exports = function ({ session }) {
|
||||||
|
return {
|
||||||
|
seed: [{
|
||||||
|
id: "st:home",
|
||||||
|
tags: [ "st:home" ],
|
||||||
|
data: {}
|
||||||
|
}],
|
||||||
|
tags: {
|
||||||
|
"st:home": [ "st:findCategories" ],
|
||||||
|
"st:category": [ "st:scrapeCategory" ],
|
||||||
|
"st:product": [ "st:scrapeProduct", "st:normalizeProduct" ],
|
||||||
|
},
|
||||||
|
tasks: {
|
||||||
|
"st:findCategories": {
|
||||||
|
ttl: "15d",
|
||||||
|
run: async function ({ createItem }) {
|
||||||
|
let response = await session.get("https://www.st.com/content/st_com/en.html");
|
||||||
|
assureResponse(response);
|
||||||
|
|
||||||
|
let $ = cheerio.load(response.body);
|
||||||
|
|
||||||
|
let links = $("#Top_Menu_Products :is(a.st-nav__blockmenu-title, a.st-nav__blockmenu-link)")
|
||||||
|
.toArray()
|
||||||
|
.map((element) => $(element).attr("href"))
|
||||||
|
.map((relativeURL) => url.resolve("https://www.st.com/", relativeURL));
|
||||||
|
|
||||||
|
for (let link of links) {
|
||||||
|
createItem({
|
||||||
|
id: `st:category:${link}`,
|
||||||
|
tags: [ "st:category" ],
|
||||||
|
data: { url: link }
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"st:scrapeCategory": {
|
||||||
|
ttl: "1d",
|
||||||
|
taskInterval: "60s",
|
||||||
|
version: "2",
|
||||||
|
run: async function({ data, createItem }) {
|
||||||
|
let response = await session.get(data.url);
|
||||||
|
assureResponse(response);
|
||||||
|
|
||||||
|
let prmisID = extractID(response.body.toString());
|
||||||
|
|
||||||
|
let listingResponse = await session.get(`https://www.st.com/en/documentation/scraper.cxst-ps-grid.html/${encodeURIComponent(prmisID)}.json`, { noDecode: true });
|
||||||
|
assureResponse(response);
|
||||||
|
let listingBuffer = listingResponse.body;
|
||||||
|
|
||||||
|
if (listingBuffer.length > 0) {
|
||||||
|
// This is a category that has a product explorer
|
||||||
|
let listing = JSON.parse(listingBuffer.toString());
|
||||||
|
|
||||||
|
let cellNames = listing.columns.map((column) => {
|
||||||
|
let cellName = (column.identifier != null)
|
||||||
|
? `${column.identifier}_${column.qualifier_identifier}`
|
||||||
|
: `nonstandard:${column.name}:${column.qualifier}`
|
||||||
|
|
||||||
|
createItem({
|
||||||
|
id: `st:column:${cellName}`,
|
||||||
|
tags: [ "st:column" ],
|
||||||
|
data: column
|
||||||
|
});
|
||||||
|
|
||||||
|
return cellName;
|
||||||
|
});
|
||||||
|
|
||||||
|
for (let row of listing.rows) {
|
||||||
|
assert(row.productId != null);
|
||||||
|
|
||||||
|
let cellData = syncpipe(row.cells, [
|
||||||
|
(_) => _.map((cell, i) => [ cellNames[i], cell.value ]),
|
||||||
|
(_) => Object.fromEntries(_)
|
||||||
|
]);
|
||||||
|
|
||||||
|
createItem({
|
||||||
|
id: `st:product:${row.productId}`,
|
||||||
|
tags: [ "st:product" ],
|
||||||
|
data: {
|
||||||
|
... row,
|
||||||
|
cells: undefined,
|
||||||
|
cellData: cellData
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
console.warn("Warning: empty response, category does not have product explorer");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"st:scrapeProduct": {
|
||||||
|
ttl: "15d",
|
||||||
|
taskInterval: "5s",
|
||||||
|
run: async function({ data, createItem, updateData, expireDependents }) {
|
||||||
|
if (data.productFolderUrl == null) {
|
||||||
|
throw new Error(`No known product page URL`);
|
||||||
|
}
|
||||||
|
|
||||||
|
let productPageURL = url.resolve("https://www.st.com/", data.productFolderUrl);
|
||||||
|
|
||||||
|
let response = await session.get(productPageURL);
|
||||||
|
assureResponse(response);
|
||||||
|
|
||||||
|
let $ = cheerio.load(response.body);
|
||||||
|
|
||||||
|
let datasheetLink = $("a[data-js='datasheetLink']").attr("href");
|
||||||
|
let datasheetURL = (datasheetLink != null)
|
||||||
|
? url.resolve(productPageURL, datasheetLink)
|
||||||
|
: null;
|
||||||
|
|
||||||
|
let resources = $(".st-table--resources")
|
||||||
|
.find("h3").toArray()
|
||||||
|
.map((heading) => {
|
||||||
|
let $heading = $(heading);
|
||||||
|
let sectionID = $heading.attr("id");
|
||||||
|
let sectionTitle = $heading.text().trim();
|
||||||
|
|
||||||
|
let $table = $heading.next("table");
|
||||||
|
|
||||||
|
let items = $table
|
||||||
|
.find("tbody tr").toArray()
|
||||||
|
.map((row) => {
|
||||||
|
let $row = $(row);
|
||||||
|
let $mainView = $row.find(".visible-on-desktop-only");
|
||||||
|
let $link = $mainView.find("a.st-link");
|
||||||
|
|
||||||
|
return {
|
||||||
|
url: url.resolve(productPageURL, $link.attr("href")),
|
||||||
|
documentID: $link.find("span.st-font--bold").text().trim(),
|
||||||
|
description: $link.find("span:not(.st-font--bold)").text().trim(),
|
||||||
|
version: getUntaggedText($link),
|
||||||
|
date: $row.find(".visible-on-desktop-only[data-latest-update]").text().trim()
|
||||||
|
};
|
||||||
|
});
|
||||||
|
|
||||||
|
return {
|
||||||
|
sectionID: sectionID,
|
||||||
|
sectionTitle: sectionTitle,
|
||||||
|
items: items
|
||||||
|
};
|
||||||
|
});
|
||||||
|
|
||||||
|
updateData((data) => {
|
||||||
|
return {
|
||||||
|
... data,
|
||||||
|
datasheetLink: datasheetURL,
|
||||||
|
resources: resources
|
||||||
|
};
|
||||||
|
});
|
||||||
|
|
||||||
|
expireDependents();
|
||||||
|
|
||||||
|
for (let section of resources) {
|
||||||
|
for (let resource of section.items) {
|
||||||
|
createItem({
|
||||||
|
id: `st:resource:${resource.url}`,
|
||||||
|
tags: (resource.url === datasheetURL)
|
||||||
|
? [ "st:resource", "st:datasheet" ]
|
||||||
|
: [ "st_resource" ],
|
||||||
|
data: { url: resource.url }
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"st:normalizeProduct": {
|
||||||
|
dependsOn: [ "st:scrapeProduct" ],
|
||||||
|
version: "8",
|
||||||
|
parallelTasks: 50,
|
||||||
|
run: async function (api) {
|
||||||
|
let { data } = api;
|
||||||
|
|
||||||
|
createDatasheet(api, {
|
||||||
|
priority: 0.8,
|
||||||
|
source: "st",
|
||||||
|
manufacturer: "STMicroelectronics",
|
||||||
|
productID: data.productId,
|
||||||
|
name: data.cellData["XJE010_VT-007"],
|
||||||
|
description: htmlEntities.decode(data.cellData["XJE014_VT-007"]),
|
||||||
|
url: data.datasheetLink
|
||||||
|
});
|
||||||
|
}
|
||||||
|
},
|
||||||
|
}
|
||||||
|
};
|
||||||
|
};
|
@ -0,0 +1,202 @@
|
|||||||
|
"use strict";
|
||||||
|
|
||||||
|
const assert = require("assert");
|
||||||
|
const cheerio = require("cheerio");
|
||||||
|
const syncpipe = require("syncpipe");
|
||||||
|
const url = require("url");
|
||||||
|
|
||||||
|
const pipe = require("@promistream/pipe");
|
||||||
|
const simpleSink = require("@promistream/simple-sink");
|
||||||
|
const assureResponse = require("../../shared/assure-response");
|
||||||
|
const parseSitemapResponse = require("../../shared/parse-sitemap-response");
|
||||||
|
const createDatasheet = require("../../shared/create-datasheet");
|
||||||
|
|
||||||
|
// TME.eu
|
||||||
|
|
||||||
|
/**
 * Picks the first usable candidate from a list.
 *
 * @param {Iterable<*>} options - Candidates in order of preference.
 * @returns {*} The first candidate that is neither null/undefined nor the empty string,
 *   or `undefined` when there is none.
 */
function firstMatch(options) {
	const isUsable = (candidate) => candidate != null && candidate !== "";

	for (const candidate of options) {
		if (isUsable(candidate)) {
			return candidate;
		}
	}

	return undefined;
}
|
||||||
|
|
||||||
|
module.exports = function ({ session }) {
|
||||||
|
return {
|
||||||
|
seed: [{
|
||||||
|
id: "tme:sitemap:index",
|
||||||
|
tags: [ "tme:sitemap" ],
|
||||||
|
data: { url: "https://www.tme.eu/en/sitemap.xml" }
|
||||||
|
// TODO: Delete derived sitemap entries
|
||||||
|
}],
|
||||||
|
tags: {
|
||||||
|
"tme:sitemap": [ "tme:scrapeSitemap" ],
|
||||||
|
"tme:product": [ "tme:scrapeProduct", "tme:normalizeProduct" ],
|
||||||
|
},
|
||||||
|
tasks: {
|
||||||
|
"tme:scrapeSitemap": {
|
||||||
|
ttl: "3d",
|
||||||
|
taskInterval: "30s",
|
||||||
|
run: async function ({ data, createItem, deleteItem }) {
|
||||||
|
let response = await session.get(data.url, { stream: true });
|
||||||
|
assureResponse(response);
|
||||||
|
|
||||||
|
let resultCount = 0;
|
||||||
|
|
||||||
|
await pipe([
|
||||||
|
parseSitemapResponse(response),
|
||||||
|
simpleSink((item) => {
|
||||||
|
assert(item.url);
|
||||||
|
|
||||||
|
if (item.type === "sitemap") {
|
||||||
|
// NOTE: We are only interested in the sitemaps that enumerate components, not those that list categories etc.
|
||||||
|
if (/pip_part[0-9]+\.xml(\.gz)?/.test(item.url)) {
|
||||||
|
createItem({
|
||||||
|
id: `tme:sitemap:${item.url}`,
|
||||||
|
tags: [ "tme:sitemap" ],
|
||||||
|
data: { url: item.url }
|
||||||
|
});
|
||||||
|
|
||||||
|
resultCount += 1;
|
||||||
|
}
|
||||||
|
} else if (item.type === "url") {
|
||||||
|
if (item.url.startsWith("https://www.tme.eu/en/details/")) {
|
||||||
|
createItem({
|
||||||
|
id: `tme:product:${item.url}`,
|
||||||
|
tags: [ "tme:product" ],
|
||||||
|
data: { url: item.url }
|
||||||
|
});
|
||||||
|
|
||||||
|
resultCount += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
]).read();
|
||||||
|
|
||||||
|
// If we don't get at least *some* items out of a sitemap, something is wrong - eg. the URL format changed and we are no longer matching anything.
|
||||||
|
assert(resultCount > 0);
|
||||||
|
|
||||||
|
// FIXME: Do this here? Or is there a reason *not* to delete the sitemap entry?
|
||||||
|
// deleteItem();
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"tme:scrapeProduct": {
|
||||||
|
ttl: "60d",
|
||||||
|
taskInterval: "500ms",
|
||||||
|
run: async function ({ data, createAlias, updateData, expireDependents }) {
|
||||||
|
let response = await session.get(data.url);
|
||||||
|
assureResponse(response);
|
||||||
|
|
||||||
|
let $ = cheerio.load(response.body);
|
||||||
|
|
||||||
|
// FIXME: This is currently broken!
|
||||||
|
let allMetaHeaders = syncpipe($("h2.o-semantic-only-header").toArray(), [
|
||||||
|
(_) => _.map((header) => {
|
||||||
|
let $header = $(header);
|
||||||
|
|
||||||
|
return [
|
||||||
|
$header.find(".name").text().trim(),
|
||||||
|
$header.find(".value").text().trim()
|
||||||
|
];
|
||||||
|
}),
|
||||||
|
(_) => Object.fromEntries(_)
|
||||||
|
]);
|
||||||
|
|
||||||
|
let descriptionElement = $(".c-pip__description > h2").eq(0);
|
||||||
|
|
||||||
|
let itemData = {
|
||||||
|
productID: $("h2.c-pip__symbol--tme .c-pip__symbol-value").eq(0).text().trim(),
|
||||||
|
manufacturer: $("a.pip__product-header-title").eq(0).text().trim(),
|
||||||
|
model: firstMatch([
|
||||||
|
$("h2.c-pip__symbol--producer .c-pip__symbol-value").eq(0).text().trim(), // Manufacturer part number
|
||||||
|
$("h2.c-pip__symbol--tme .c-pip__symbol-value").eq(0).text().trim(), // TME Symbol
|
||||||
|
]),
|
||||||
|
description: (descriptionElement.children().length === 0) // This skips meta fields if there is no description element
|
||||||
|
? descriptionElement.text().trim()
|
||||||
|
: null,
|
||||||
|
documents: $("div.c-pip__document > a").toArray()
|
||||||
|
.map((link) => {
|
||||||
|
let relativeLink = $(link).attr("href");
|
||||||
|
|
||||||
|
if (relativeLink != null) {
|
||||||
|
return {
|
||||||
|
description: $(link).text().trim(),
|
||||||
|
url: url.resolve(
|
||||||
|
data.url,
|
||||||
|
relativeLink
|
||||||
|
)
|
||||||
|
};
|
||||||
|
} else {
|
||||||
|
// Probably a video popup
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.filter((item) => {
|
||||||
|
return item != null;
|
||||||
|
}),
|
||||||
|
// TODO: Scrape prices
|
||||||
|
};
|
||||||
|
|
||||||
|
assert(itemData.productID != null);
|
||||||
|
assert(itemData.manufacturer != null);
|
||||||
|
assert(itemData.model != null);
|
||||||
|
|
||||||
|
createAlias({ from: `tme:product:${itemData.productID}` });
|
||||||
|
|
||||||
|
updateData((oldData) => ({
|
||||||
|
... oldData,
|
||||||
|
itemData: itemData
|
||||||
|
}));
|
||||||
|
|
||||||
|
expireDependents();
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"tme:normalizeProduct": {
|
||||||
|
dependsOn: [ "tme:scrapeProduct" ],
|
||||||
|
version: "5",
|
||||||
|
parallelTasks: 50,
|
||||||
|
run: async function (api) {
|
||||||
|
let { data } = api;
|
||||||
|
|
||||||
|
function isEnglish(document) {
|
||||||
|
return /\sen\s*$/i.test(document.description);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (data.itemData.documents.length > 0) {
|
||||||
|
if (typeof data.itemData.documents[0] === "string") {
|
||||||
|
// Temporary workaround for the dependsOn not taking into account task versions, and some old records existing with a wrong documents structure
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
let manufacturer = data.itemData.manufacturer;
|
||||||
|
let modelName = data.itemData.model;
|
||||||
|
let description = data.itemData.description;
|
||||||
|
let productID = data.itemData.productID;
|
||||||
|
|
||||||
|
let firstEnglish = data.itemData.documents.find((document) => isEnglish(document));
|
||||||
|
|
||||||
|
let bestDocument = (firstEnglish != null)
|
||||||
|
? firstEnglish
|
||||||
|
: data.itemData.documents[0];
|
||||||
|
|
||||||
|
let bestDocumentIsEnglish = isEnglish(bestDocument);
|
||||||
|
|
||||||
|
createDatasheet(api, {
|
||||||
|
priority: (bestDocumentIsEnglish)
|
||||||
|
? 0.6
|
||||||
|
: 0.5,
|
||||||
|
source: "tme",
|
||||||
|
manufacturer: manufacturer,
|
||||||
|
productID: productID,
|
||||||
|
name: modelName,
|
||||||
|
description: description,
|
||||||
|
url: bestDocument.url,
|
||||||
|
// NOTE: Most (but not all!) manufacturers on TME are, incorrectly, in ALL-CAPS. This 'fixes' those cases through best-effort capitalization. Many (but less!) will still be wrong and need to be fixed later.
|
||||||
|
fixCasing: true
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
}
|
||||||
|
};
|
||||||
|
};
|
@ -1,12 +0,0 @@
|
|||||||
"use strict";
|
|
||||||
|
|
||||||
module.exports = function extractID(string) {
|
|
||||||
// Quick-and-dirty category ID parsing from category pages
|
|
||||||
let match = /"prmisID":"([^"]+)"/.exec(string);
|
|
||||||
|
|
||||||
if (match != null) {
|
|
||||||
return match[1];
|
|
||||||
} else {
|
|
||||||
throw new Error(`ST: prmis ID expected but not found`);
|
|
||||||
}
|
|
||||||
};
|
|
@ -1,28 +0,0 @@
|
|||||||
"use strict";
|
|
||||||
|
|
||||||
const cheerio = require("cheerio");
|
|
||||||
const url = require("url");
|
|
||||||
|
|
||||||
const assureResponse = require("../../shared/assure-response");
|
|
||||||
|
|
||||||
module.exports = function findCategories({ session }) {
|
|
||||||
return async function ({ createItem }) {
|
|
||||||
let response = await session.get("https://www.st.com/content/st_com/en.html");
|
|
||||||
assureResponse(response);
|
|
||||||
|
|
||||||
let $ = cheerio.load(response.body);
|
|
||||||
|
|
||||||
let links = $("#Top_Menu_Products :is(a.st-nav__blockmenu-title, a.st-nav__blockmenu-link)")
|
|
||||||
.toArray()
|
|
||||||
.map((element) => $(element).attr("href"))
|
|
||||||
.map((relativeURL) => url.resolve("https://www.st.com/", relativeURL));
|
|
||||||
|
|
||||||
for (let link of links) {
|
|
||||||
createItem({
|
|
||||||
id: `st:category:${link}`,
|
|
||||||
tags: [ "st:category" ],
|
|
||||||
data: { url: link }
|
|
||||||
});
|
|
||||||
}
|
|
||||||
};
|
|
||||||
};
|
|
@ -1,20 +0,0 @@
|
|||||||
"use strict";
|
|
||||||
|
|
||||||
const htmlEntities = require("html-entities");
|
|
||||||
const createDatasheet = require("../../shared/create-datasheet");
|
|
||||||
|
|
||||||
module.exports = function normalizeProduct() {
|
|
||||||
return async function (api) {
|
|
||||||
let { data } = api;
|
|
||||||
|
|
||||||
createDatasheet(api, {
|
|
||||||
priority: 0.8,
|
|
||||||
source: "st",
|
|
||||||
manufacturer: "STMicroelectronics",
|
|
||||||
productID: data.productId,
|
|
||||||
name: data.cellData["XJE010_VT-007"],
|
|
||||||
description: htmlEntities.decode(data.cellData["XJE014_VT-007"]),
|
|
||||||
url: data.datasheetLink
|
|
||||||
});
|
|
||||||
};
|
|
||||||
};
|
|
@ -1,60 +0,0 @@
|
|||||||
"use strict";
|
|
||||||
|
|
||||||
const assert = require("assert");
|
|
||||||
const syncpipe = require("syncpipe");
|
|
||||||
|
|
||||||
const assureResponse = require("../../shared/assure-response");
|
|
||||||
const extractId = require("../extract-id");
|
|
||||||
|
|
||||||
module.exports = function scrapeCategory({ session }) {
|
|
||||||
return async function({ data, createItem }) {
|
|
||||||
let response = await session.get(data.url);
|
|
||||||
assureResponse(response);
|
|
||||||
|
|
||||||
let prmisID = extractId(response.body.toString());
|
|
||||||
|
|
||||||
let listingResponse = await session.get(`https://www.st.com/en/documentation/scraper.cxst-ps-grid.html/${encodeURIComponent(prmisID)}.json`, { noDecode: true });
|
|
||||||
assureResponse(response);
|
|
||||||
let listingBuffer = listingResponse.body;
|
|
||||||
|
|
||||||
if (listingBuffer.length > 0) {
|
|
||||||
// This is a category that has a product explorer
|
|
||||||
let listing = JSON.parse(listingBuffer.toString());
|
|
||||||
|
|
||||||
let cellNames = listing.columns.map((column) => {
|
|
||||||
let cellName = (column.identifier != null)
|
|
||||||
? `${column.identifier}_${column.qualifier_identifier}`
|
|
||||||
: `nonstandard:${column.name}:${column.qualifier}`
|
|
||||||
|
|
||||||
createItem({
|
|
||||||
id: `st:column:${cellName}`,
|
|
||||||
tags: [ "st:column" ],
|
|
||||||
data: column
|
|
||||||
});
|
|
||||||
|
|
||||||
return cellName;
|
|
||||||
});
|
|
||||||
|
|
||||||
for (let row of listing.rows) {
|
|
||||||
assert(row.productId != null);
|
|
||||||
|
|
||||||
let cellData = syncpipe(row.cells, [
|
|
||||||
(_) => _.map((cell, i) => [ cellNames[i], cell.value ]),
|
|
||||||
(_) => Object.fromEntries(_)
|
|
||||||
]);
|
|
||||||
|
|
||||||
createItem({
|
|
||||||
id: `st:product:${row.productId}`,
|
|
||||||
tags: [ "st:product" ],
|
|
||||||
data: {
|
|
||||||
... row,
|
|
||||||
cells: undefined,
|
|
||||||
cellData: cellData
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
console.warn("Warning: empty response, category does not have product explorer");
|
|
||||||
}
|
|
||||||
};
|
|
||||||
};
|
|
@ -1,81 +0,0 @@
|
|||||||
"use strict";
|
|
||||||
|
|
||||||
const cheerio = require("cheerio");
|
|
||||||
const url = require("url");
|
|
||||||
|
|
||||||
const assureResponse = require("../../shared/assure-response");
|
|
||||||
const getUntaggedText = require("../../shared/get-untagged-text");
|
|
||||||
|
|
||||||
module.exports = function scrapeProduct({ session }) {
|
|
||||||
return async function({ data, createItem, updateData, expireDependents }) {
|
|
||||||
if (data.productFolderUrl == null) {
|
|
||||||
throw new Error(`No known product page URL`);
|
|
||||||
}
|
|
||||||
|
|
||||||
let productPageURL = url.resolve("https://www.st.com/", data.productFolderUrl);
|
|
||||||
|
|
||||||
let response = await session.get(productPageURL);
|
|
||||||
assureResponse(response);
|
|
||||||
|
|
||||||
let $ = cheerio.load(response.body);
|
|
||||||
|
|
||||||
let datasheetLink = $("a[data-js='datasheetLink']").attr("href");
|
|
||||||
let datasheetURL = (datasheetLink != null)
|
|
||||||
? url.resolve(productPageURL, datasheetLink)
|
|
||||||
: null;
|
|
||||||
|
|
||||||
let resources = $(".st-table--resources")
|
|
||||||
.find("h3").toArray()
|
|
||||||
.map((heading) => {
|
|
||||||
let $heading = $(heading);
|
|
||||||
let sectionID = $heading.attr("id");
|
|
||||||
let sectionTitle = $heading.text().trim();
|
|
||||||
|
|
||||||
let $table = $heading.next("table");
|
|
||||||
|
|
||||||
let items = $table
|
|
||||||
.find("tbody tr").toArray()
|
|
||||||
.map((row) => {
|
|
||||||
let $row = $(row);
|
|
||||||
let $mainView = $row.find(".visible-on-desktop-only");
|
|
||||||
let $link = $mainView.find("a.st-link");
|
|
||||||
|
|
||||||
return {
|
|
||||||
url: url.resolve(productPageURL, $link.attr("href")),
|
|
||||||
documentID: $link.find("span.st-font--bold").text().trim(),
|
|
||||||
description: $link.find("span:not(.st-font--bold)").text().trim(),
|
|
||||||
version: getUntaggedText($link),
|
|
||||||
date: $row.find(".visible-on-desktop-only[data-latest-update]").text().trim()
|
|
||||||
};
|
|
||||||
});
|
|
||||||
|
|
||||||
return {
|
|
||||||
sectionID: sectionID,
|
|
||||||
sectionTitle: sectionTitle,
|
|
||||||
items: items
|
|
||||||
};
|
|
||||||
});
|
|
||||||
|
|
||||||
updateData((data) => {
|
|
||||||
return {
|
|
||||||
... data,
|
|
||||||
datasheetLink: datasheetURL,
|
|
||||||
resources: resources
|
|
||||||
};
|
|
||||||
});
|
|
||||||
|
|
||||||
expireDependents();
|
|
||||||
|
|
||||||
for (let section of resources) {
|
|
||||||
for (let resource of section.items) {
|
|
||||||
createItem({
|
|
||||||
id: `st:resource:${resource.url}`,
|
|
||||||
tags: (resource.url === datasheetURL)
|
|
||||||
? [ "st:resource", "st:datasheet" ]
|
|
||||||
: [ "st_resource" ],
|
|
||||||
data: { url: resource.url }
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
};
|
|
@ -1,47 +0,0 @@
|
|||||||
"use strict";
|
|
||||||
|
|
||||||
const createDatasheet = require("../../shared/create-datasheet");
|
|
||||||
|
|
||||||
// Reports whether a document's description ends in a whitespace-separated
// "en" token (case-insensitive, trailing whitespace allowed).
function isEnglish(document) {
	const englishSuffix = /\sen\s*$/i;

	return englishSuffix.test(document.description);
}
|
|
||||||
|
|
||||||
module.exports = function tmeNormalizeProduct() {
|
|
||||||
return async function (api) {
|
|
||||||
let { data } = api;
|
|
||||||
|
|
||||||
if (data.itemData.documents.length > 0) {
|
|
||||||
if (typeof data.itemData.documents[0] === "string") {
|
|
||||||
// Temporary workaround for the dependsOn not taking into account task versions, and some old records existing with a wrong documents structure
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
let manufacturer = data.itemData.manufacturer;
|
|
||||||
let modelName = data.itemData.model;
|
|
||||||
let description = data.itemData.description;
|
|
||||||
let productID = data.itemData.productID;
|
|
||||||
|
|
||||||
let firstEnglish = data.itemData.documents.find((document) => isEnglish(document));
|
|
||||||
|
|
||||||
let bestDocument = (firstEnglish != null)
|
|
||||||
? firstEnglish
|
|
||||||
: data.itemData.documents[0];
|
|
||||||
|
|
||||||
let bestDocumentIsEnglish = isEnglish(bestDocument);
|
|
||||||
|
|
||||||
createDatasheet(api, {
|
|
||||||
priority: (bestDocumentIsEnglish)
|
|
||||||
? 0.6
|
|
||||||
: 0.5,
|
|
||||||
source: "tme",
|
|
||||||
manufacturer: manufacturer,
|
|
||||||
productID: productID,
|
|
||||||
name: modelName,
|
|
||||||
description: description,
|
|
||||||
url: bestDocument.url,
|
|
||||||
// NOTE: Most (but not all!) manufacturers on TME are, incorrectly, in ALL-CAPS. This 'fixes' those cases through best-effort capitalization. Many (but less!) will still be wrong and need to be fixed later.
|
|
||||||
fixCasing: true
|
|
||||||
});
|
|
||||||
}
|
|
||||||
};
|
|
||||||
};
|
|
@ -1,86 +0,0 @@
|
|||||||
"use strict";
|
|
||||||
|
|
||||||
const assert = require("assert");
|
|
||||||
const cheerio = require("cheerio");
|
|
||||||
const syncpipe = require("syncpipe");
|
|
||||||
const url = require("url");
|
|
||||||
|
|
||||||
const assureResponse = require("../../shared/assure-response");
|
|
||||||
|
|
||||||
// Returns the first option that is neither null/undefined nor the empty
// string, or undefined when no option qualifies.
function firstMatch(options) {
	for (const candidate of options) {
		const isBlank = (candidate == null || candidate === "");

		if (isBlank) {
			continue;
		}

		return candidate;
	}

	return undefined;
}
|
|
||||||
|
|
||||||
module.exports = function tmeScrapeProduct({ session }) {
|
|
||||||
return async function ({ data, createAlias, updateData, expireDependents }) {
|
|
||||||
let response = await session.get(data.url);
|
|
||||||
assureResponse(response);
|
|
||||||
|
|
||||||
let $ = cheerio.load(response.body);
|
|
||||||
|
|
||||||
// FIXME: This is currently broken!
|
|
||||||
let allMetaHeaders = syncpipe($("h2.o-semantic-only-header").toArray(), [
|
|
||||||
(_) => _.map((header) => {
|
|
||||||
let $header = $(header);
|
|
||||||
|
|
||||||
return [
|
|
||||||
$header.find(".name").text().trim(),
|
|
||||||
$header.find(".value").text().trim()
|
|
||||||
];
|
|
||||||
}),
|
|
||||||
(_) => Object.fromEntries(_)
|
|
||||||
]);
|
|
||||||
|
|
||||||
let descriptionElement = $(".c-pip__description > h2").eq(0);
|
|
||||||
|
|
||||||
let itemData = {
|
|
||||||
productID: $("h2.c-pip__symbol--tme .c-pip__symbol-value").eq(0).text().trim(),
|
|
||||||
manufacturer: $("a.pip__product-header-title").eq(0).text().trim(),
|
|
||||||
model: firstMatch([
|
|
||||||
$("h2.c-pip__symbol--producer .c-pip__symbol-value").eq(0).text().trim(), // Manufacturer part number
|
|
||||||
$("h2.c-pip__symbol--tme .c-pip__symbol-value").eq(0).text().trim(), // TME Symbol
|
|
||||||
]),
|
|
||||||
description: (descriptionElement.children().length === 0) // This skips meta fields if there is no description element
|
|
||||||
? descriptionElement.text().trim()
|
|
||||||
: null,
|
|
||||||
documents: $("div.c-pip__document > a").toArray()
|
|
||||||
.map((link) => {
|
|
||||||
let relativeLink = $(link).attr("href");
|
|
||||||
|
|
||||||
if (relativeLink != null) {
|
|
||||||
return {
|
|
||||||
description: $(link).text().trim(),
|
|
||||||
url: url.resolve(
|
|
||||||
data.url,
|
|
||||||
relativeLink
|
|
||||||
)
|
|
||||||
};
|
|
||||||
} else {
|
|
||||||
// Probably a video popup
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
})
|
|
||||||
.filter((item) => {
|
|
||||||
return item != null;
|
|
||||||
}),
|
|
||||||
// TODO: Scrape prices
|
|
||||||
};
|
|
||||||
|
|
||||||
assert(itemData.productID != null);
|
|
||||||
assert(itemData.manufacturer != null);
|
|
||||||
assert(itemData.model != null);
|
|
||||||
|
|
||||||
createAlias({ from: `tme:product:${itemData.productID}` });
|
|
||||||
|
|
||||||
updateData((oldData) => ({
|
|
||||||
... oldData,
|
|
||||||
itemData: itemData
|
|
||||||
}));
|
|
||||||
|
|
||||||
expireDependents();
|
|
||||||
};
|
|
||||||
};
|
|
@ -1,50 +0,0 @@
|
|||||||
"use strict";
|
|
||||||
|
|
||||||
const assert = require("assert");
|
|
||||||
|
|
||||||
const pipe = require("@promistream/pipe");
|
|
||||||
const simpleSink = require("@promistream/simple-sink");
|
|
||||||
const assureResponse = require("../../shared/assure-response");
|
|
||||||
const parseSitemapResponse = require("../../shared/parse-sitemap-response");
|
|
||||||
|
|
||||||
module.exports = function tmeScrapeSitemap({ session }) {
|
|
||||||
return async function ({ data, createItem }) {
|
|
||||||
let response = await session.get(data.url, { stream: true });
|
|
||||||
assureResponse(response);
|
|
||||||
|
|
||||||
let resultCount = 0;
|
|
||||||
|
|
||||||
await pipe([
|
|
||||||
parseSitemapResponse(response),
|
|
||||||
simpleSink((item) => {
|
|
||||||
assert(item.url);
|
|
||||||
|
|
||||||
if (item.type === "sitemap") {
|
|
||||||
// NOTE: We are only interested in the sitemaps that enumerate components, not those that list categories etc.
|
|
||||||
if (/pip_part[0-9]+\.xml(\.gz)?/.test(item.url)) {
|
|
||||||
createItem({
|
|
||||||
id: `tme:sitemap:${item.url}`,
|
|
||||||
tags: [ "tme:sitemap" ],
|
|
||||||
data: { url: item.url }
|
|
||||||
});
|
|
||||||
|
|
||||||
resultCount += 1;
|
|
||||||
}
|
|
||||||
} else if (item.type === "url") {
|
|
||||||
if (item.url.startsWith("https://www.tme.eu/en/details/")) {
|
|
||||||
createItem({
|
|
||||||
id: `tme:product:${item.url}`,
|
|
||||||
tags: [ "tme:product" ],
|
|
||||||
data: { url: item.url }
|
|
||||||
});
|
|
||||||
|
|
||||||
resultCount += 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
})
|
|
||||||
]).read();
|
|
||||||
|
|
||||||
// If we don't get at least *some* items out of a sitemap, something is wrong - eg. the URL format changed and we are no longer matching anything.
|
|
||||||
assert(resultCount > 0);
|
|
||||||
};
|
|
||||||
};
|
|
@ -0,0 +1,3 @@
|
|||||||
|
"use strict";
|
||||||
|
|
||||||
|
// Debug helper: print the source text of the "execall" export, loaded through fix-esm.
const fixESM = require("fix-esm");
const execall = fixESM.require("execall");

console.log(execall.toString());
|
@ -0,0 +1,7 @@
|
|||||||
|
- add source: http://www.injoinic.com/product_detail/id/21.html
|
||||||
|
- LCSC: remove placeholder URLs that aren't actually datasheets
|
||||||
|
- rework createDatasheet:
|
||||||
|
- retain alternate options
|
||||||
|
- track language of each entry (when known)
|
||||||
|
- also retain non-datasheet documentation
|
||||||
|
- add source: realtek
|
Loading…
Reference in New Issue