30 changed files with 769 additions and 766 deletions
@ -1,30 +0,0 @@ |
|||
"use strict"; |
|||
|
|||
const syncpipe = require("syncpipe"); |
|||
const url = require("url"); |
|||
|
|||
const assureResponse = require("../../shared/assure-response"); |
|||
const surgeon = require("../../shared/surgeon-utils"); |
|||
const uniqueArray = require("../../shared/unique-array"); |
|||
|
|||
module.exports = function findCategories({ session }) { |
|||
return async function({ createItem }) { |
|||
let response = await session.get("https://focuslcds.com/"); |
|||
assureResponse(response); |
|||
|
|||
let urls = syncpipe(null, [ |
|||
_ => surgeon(`selectMany ".category-list a" | readAttr href`, response.body.toString()), |
|||
_ => uniqueArray(_), |
|||
_ => _.filter((relativeURL) => relativeURL !== ""), |
|||
_ => _.map((relativeURL) => url.resolve("https://focuslcds.com/", relativeURL)) |
|||
]); |
|||
|
|||
for (let url of urls) { |
|||
createItem({ |
|||
id: `focus-lcds:category:${url}`, |
|||
tags: [ "focus-lcds:category" ], |
|||
data: { url: url } |
|||
}); |
|||
} |
|||
}; |
|||
}; |
@ -1,19 +0,0 @@ |
|||
"use strict"; |
|||
|
|||
const createDatasheet = require("../../shared/create-datasheet"); |
|||
|
|||
module.exports = function normalizeProduct() { |
|||
return async function (api) { |
|||
let { data } = api; |
|||
|
|||
createDatasheet(api, { |
|||
priority: 0.8, |
|||
source: "focus-lcds", |
|||
manufacturer: data.itemData.manufacturer ?? "Focus LCDs", |
|||
productID: null, |
|||
name: data.itemData.name, |
|||
description: data.itemData.description, |
|||
url: data.itemData.datasheetURL |
|||
}); |
|||
}; |
|||
}; |
@ -1,33 +0,0 @@ |
|||
"use strict"; |
|||
|
|||
const assureResponse = require("../../shared/assure-response"); |
|||
const surgeon = require("../../shared/surgeon-utils"); |
|||
|
|||
module.exports = function scrapeCategory({ session }) { |
|||
return async function({ data, createItem }) { |
|||
let response = await session.get(data.url); |
|||
assureResponse(response); |
|||
|
|||
let body = response.body.toString(); |
|||
|
|||
let nextPageURL = surgeon(`selectMaybeN ".pagination-item--next a" 0 | readAttr href`, body); |
|||
|
|||
if (nextPageURL != null) { |
|||
createItem({ |
|||
id: `focus-lcds:category:${nextPageURL}`, |
|||
tags: [ "focus-lcds:category" ], |
|||
data: { url: nextPageURL } |
|||
}); |
|||
} |
|||
|
|||
let items = surgeon(`selectMany "ul.productList .listItem-title a" | readAttr href`, body); |
|||
|
|||
for (let url of items) { |
|||
createItem({ |
|||
id: `focus-lcds:product:${url}`, |
|||
tags: [ "focus-lcds:product" ], |
|||
data: { url: url } |
|||
}); |
|||
} |
|||
}; |
|||
}; |
@ -1,45 +0,0 @@ |
|||
"use strict"; |
|||
|
|||
const url = require("url"); |
|||
|
|||
const assureResponse = require("../../shared/assure-response"); |
|||
const surgeon = require("../../shared/surgeon-utils"); |
|||
const extractModelNumber = require("../../shared/extract-model-number"); |
|||
|
|||
module.exports = function scrapeProduct({ session }) { |
|||
return async function({ data, updateData, expireDependents }) { |
|||
let response = await session.get(data.url); |
|||
assureResponse(response); |
|||
|
|||
let body = response.body.toString(); |
|||
|
|||
let itemData = surgeon({ |
|||
name: [ `selectOne "meta[property='og:title']" | readAttr content`, extractModelNumber ], |
|||
manufacturer: `selectMaybeOne "[itemprop='brand']" | text | ignoreEmptyString`, |
|||
description: [ |
|||
`selectMaybeOne "meta[name='description']" | readAttr content`, |
|||
// Get rid of the keyword spam...
|
|||
(description) => description.split(",")[0] |
|||
], |
|||
image: `selectMaybeOne "meta[property='og:image']" | readAttr content`, |
|||
price: `selectMaybeOne .productView-price .price--withoutTax | text`, |
|||
datasheetURL: [ |
|||
`selectMaybeOne ".productView-info a[href^='/content/']" | readAttr href`, |
|||
(relativeURL) => (!/^\/content\/?$/.test(relativeURL)) |
|||
? url.resolve("https://focuslcds.com/", relativeURL) |
|||
: null // Ignore when the datasheet URL is *just* /content/, as that means there is no datasheet for this product
|
|||
], |
|||
technicalSpecs: [ `selectAny "#tab-description .productView-info-name"`, { |
|||
name: `text`, |
|||
value: `nextUntil ".productView-info-name" ".productView-info-value" | index 0 | text` |
|||
}] |
|||
}, body); |
|||
|
|||
updateData((oldData) => ({ |
|||
... oldData, |
|||
itemData: itemData |
|||
})); |
|||
|
|||
expireDependents(); |
|||
}; |
|||
}; |
@ -1,42 +0,0 @@ |
|||
"use strict"; |
|||
|
|||
const assert = require("assert"); |
|||
|
|||
const assureResponse = require("../../shared/assure-response"); |
|||
|
|||
module.exports = function lcscFindCategories(state) { |
|||
let { session } = state; |
|||
|
|||
return async function ({ createItem }) { |
|||
let response = await session.get("https://wwwapi.lcsc.com/v1/home/category"); |
|||
|
|||
assureResponse(response); |
|||
assert(response.body.length > 0); |
|||
assert(response.statusCode === 200); |
|||
|
|||
function processCategoryEntries(categories) { |
|||
for (let category of categories) { |
|||
let productCount = category.productNum; |
|||
let pageCount = Math.ceil(productCount / 500); |
|||
|
|||
// NOTE: We do *not* use the page count indicated by the API, but instead calculate it ourself from the product count. This is because the API-specified page count will cap out at the equivalent of 10k items, even when more pages than that are actually available.
|
|||
for (let i = 1; i <= pageCount; i++) { |
|||
createItem({ |
|||
id: `lcsc:category:${category.catalogId}:page-${i}`, |
|||
tags: [ "lcsc:category" ], |
|||
data: { |
|||
... category, |
|||
pageNumber: i |
|||
} |
|||
}); |
|||
} |
|||
|
|||
if (category.childCatelogs != null) { |
|||
processCategoryEntries(category.childCatelogs); |
|||
} |
|||
} |
|||
} |
|||
|
|||
processCategoryEntries(response.body); |
|||
}; |
|||
}; |
@ -1,19 +0,0 @@ |
|||
"use strict"; |
|||
|
|||
const createDatasheet = require("../../shared/create-datasheet"); |
|||
|
|||
module.exports = function lcscNormalizeProduct() { |
|||
return async function (api) { |
|||
let { data } = api; |
|||
|
|||
createDatasheet(api, { |
|||
priority: 0.4, |
|||
source: "lcsc", |
|||
manufacturer: data.brandNameEn, |
|||
productID: data.productCode, |
|||
name: data.productModel, |
|||
description: data.productIntroEn, |
|||
url: data.pdfUrl |
|||
}); |
|||
}; |
|||
}; |
@ -1,37 +0,0 @@ |
|||
"use strict"; |
|||
|
|||
const assert = require("assert"); |
|||
|
|||
const assureResponse = require("../../shared/assure-response"); |
|||
|
|||
// TODO: Validate response formats with validatem instead
|
|||
|
|||
module.exports = function lcscScrapeCategory(state) { |
|||
let { session } = state; |
|||
|
|||
return async function ({ data, createItem, deleteItem, updateData }) { |
|||
let response = await session.post(`https://wwwapi.lcsc.com/v1/products/list`, { |
|||
catalogIdList: [ data.catalogId ], |
|||
currentPage: data.pageNumber, |
|||
pageSize: 500, |
|||
paramNameValueMap: {} |
|||
}); |
|||
|
|||
assureResponse(response); |
|||
assert(response.statusCode === 200); |
|||
assert(response.body.productList != null); // Missing from stale queued requests?
|
|||
assert(response.body.productList.length > 0); |
|||
|
|||
for (let item of response.body.productList) { |
|||
createItem({ |
|||
// NOTE: item.productId seems like the database ID on the website, but item.productCode is the actual LCSC part number used internally for inventory management, so we use that for identification instead
|
|||
id: `lcsc:product:${item.productCode}`, |
|||
tags: [ "lcsc:product" ], |
|||
data: item |
|||
}); |
|||
} |
|||
|
|||
// We don't keep around page items, because the amount of pages for a category can change, and so this isn't a stable identifier. They'll be recreated on the next category scrape anyway.
|
|||
deleteItem(); |
|||
}; |
|||
}; |
@ -0,0 +1,21 @@ |
|||
"use strict"; |
|||
|
|||
const mergeByTemplate = require("merge-by-template"); |
|||
|
|||
/**
 * Merge rule for properties that may be defined by at most one of the merged objects.
 *
 * @param {*} a - Value from one side of the merge.
 * @param {*} b - Value from the other side of the merge.
 * @returns {*} Whichever side is set, or null/undefined when neither is.
 * @throws {Error} When both sides are set (neither null nor undefined).
 */
function noOverride(a, b) {
	if (a != null && b != null) {
		throw new Error(`Property cannot be overridden`);
	}

	// At most one side is set here. Return it rather than falling off the end of the
	// function, so a lone value is not replaced by `undefined` if the merger uses this
	// rule's return value.
	return a ?? b;
}
|||
|
|||
// Merger for scraper definitions. `backend` and `database` use the noOverride rule;
// the remaining entries are merge-by-template template values.
// NOTE(review): presumably `[]` / `{}` select the library's array/object merging and
// anyProperty([]) applies array merging to every tag key - confirm against the
// merge-by-template documentation.
const mergeTemplate = {
	backend: noOverride,
	database: noOverride,
	seed: [],
	tags: mergeByTemplate.anyProperty([]),
	tasks: {}
};

let merge = mergeByTemplate.createMerger(mergeTemplate);
|||
|
|||
module.exports = function mergeSources(base, sources) { |
|||
return merge([ base, ... sources ]); |
|||
}; |
@ -0,0 +1,137 @@ |
|||
"use strict"; |
|||
|
|||
const syncpipe = require("syncpipe"); |
|||
const url = require("url"); |
|||
|
|||
const assureResponse = require("../../shared/assure-response"); |
|||
const surgeon = require("../../shared/surgeon-utils"); |
|||
const uniqueArray = require("../../shared/unique-array"); |
|||
const extractModelNumber = require("../../shared/extract-model-number"); |
|||
const createDatasheet = require("../../shared/create-datasheet"); |
|||
|
|||
// Focus LCDs
|
|||
|
|||
module.exports = function ({ session }) { |
|||
return { |
|||
seed: [{ |
|||
id: "focus-lcds:home", |
|||
tags: [ "focus-lcds:home" ], |
|||
data: {} |
|||
}], |
|||
tags: { |
|||
"focus-lcds:home": [ "focus-lcds:findCategories" ], |
|||
"focus-lcds:category": [ "focus-lcds:scrapeCategory" ], |
|||
"focus-lcds:product": [ "focus-lcds:scrapeProduct", "focus-lcds:normalizeProduct" ], |
|||
}, |
|||
tasks: { |
|||
"focus-lcds:findCategories": { |
|||
ttl: "60d", |
|||
run: async function({ createItem }) { |
|||
let response = await session.get("https://focuslcds.com/"); |
|||
assureResponse(response); |
|||
|
|||
let urls = syncpipe(null, [ |
|||
_ => surgeon(`selectMany ".category-list a" | readAttr href`, response.body.toString()), |
|||
_ => uniqueArray(_), |
|||
_ => _.filter((relativeURL) => relativeURL !== ""), |
|||
_ => _.map((relativeURL) => url.resolve("https://focuslcds.com/", relativeURL)) |
|||
]); |
|||
|
|||
for (let url of urls) { |
|||
createItem({ |
|||
id: `focus-lcds:category:${url}`, |
|||
tags: [ "focus-lcds:category" ], |
|||
data: { url: url } |
|||
}); |
|||
} |
|||
} |
|||
}, |
|||
"focus-lcds:scrapeCategory": { |
|||
ttl: "15d", |
|||
taskInterval: "1m", |
|||
run: async function({ data, createItem }) { |
|||
let response = await session.get(data.url); |
|||
assureResponse(response); |
|||
|
|||
let body = response.body.toString(); |
|||
|
|||
let nextPageURL = surgeon(`selectMaybeN ".pagination-item--next a" 0 | readAttr href`, body); |
|||
|
|||
if (nextPageURL != null) { |
|||
createItem({ |
|||
id: `focus-lcds:category:${nextPageURL}`, |
|||
tags: [ "focus-lcds:category" ], |
|||
data: { url: nextPageURL } |
|||
}); |
|||
} |
|||
|
|||
let items = surgeon(`selectMany "ul.productList .listItem-title a" | readAttr href`, body); |
|||
|
|||
for (let url of items) { |
|||
createItem({ |
|||
id: `focus-lcds:product:${url}`, |
|||
tags: [ "focus-lcds:product" ], |
|||
data: { url: url } |
|||
}); |
|||
} |
|||
} |
|||
}, |
|||
"focus-lcds:scrapeProduct": { |
|||
ttl: "15d", |
|||
taskInterval: "5s", |
|||
run: async function({ data, updateData, expireDependents }) { |
|||
let response = await session.get(data.url); |
|||
assureResponse(response); |
|||
|
|||
let body = response.body.toString(); |
|||
|
|||
let itemData = surgeon({ |
|||
name: [ `selectOne "meta[property='og:title']" | readAttr content`, extractModelNumber ], |
|||
manufacturer: `selectMaybeOne "[itemprop='brand']" | text | ignoreEmptyString`, |
|||
description: [ |
|||
`selectMaybeOne "meta[name='description']" | readAttr content`, |
|||
// Get rid of the keyword spam...
|
|||
(description) => description.split(",")[0] |
|||
], |
|||
image: `selectMaybeOne "meta[property='og:image']" | readAttr content`, |
|||
price: `selectMaybeOne .productView-price .price--withoutTax | text`, |
|||
datasheetURL: [ |
|||
`selectMaybeOne ".productView-info a[href^='/content/']" | readAttr href`, |
|||
(relativeURL) => (!/^\/content\/?$/.test(relativeURL)) |
|||
? url.resolve("https://focuslcds.com/", relativeURL) |
|||
: null // Ignore when the datasheet URL is *just* /content/, as that means there is no datasheet for this product
|
|||
], |
|||
technicalSpecs: [ `selectAny "#tab-description .productView-info-name"`, { |
|||
name: `text`, |
|||
value: `nextUntil ".productView-info-name" ".productView-info-value" | index 0 | text` |
|||
}] |
|||
}, body); |
|||
|
|||
updateData((oldData) => ({ |
|||
... oldData, |
|||
itemData: itemData |
|||
})); |
|||
|
|||
expireDependents(); |
|||
} |
|||
}, |
|||
"focus-lcds:normalizeProduct": { |
|||
dependsOn: [ "focus-lcds:scrapeProduct" ], |
|||
parallelTasks: 50, |
|||
run: async function (api) { |
|||
let { data } = api; |
|||
|
|||
createDatasheet(api, { |
|||
priority: 0.8, |
|||
source: "focus-lcds", |
|||
manufacturer: data.itemData.manufacturer ?? "Focus LCDs", |
|||
productID: null, |
|||
name: data.itemData.name, |
|||
description: data.itemData.description, |
|||
url: data.itemData.datasheetURL |
|||
}); |
|||
} |
|||
}, |
|||
} |
|||
}; |
|||
}; |
@ -0,0 +1,108 @@ |
|||
"use strict"; |
|||
|
|||
const assert = require("assert"); |
|||
|
|||
const assureResponse = require("../../shared/assure-response"); |
|||
const createDatasheet = require("../../shared/create-datasheet"); |
|||
|
|||
// LCSC
|
|||
// TODO: Validate response formats with validatem instead
|
|||
|
|||
module.exports = function ({ session }) { |
|||
return { |
|||
seed: [{ |
|||
id: "lcsc:home", |
|||
tags: [ "lcsc:home" ], |
|||
data: {} |
|||
}], |
|||
tags: { |
|||
"lcsc:home": [ "lcsc:findCategories" ], |
|||
"lcsc:category": [ "lcsc:scrapeCategory" ], |
|||
"lcsc:product": [ "lcsc:normalizeProduct" ], |
|||
}, |
|||
tasks: { |
|||
"lcsc:findCategories": { |
|||
ttl: "30d", |
|||
version: "1", |
|||
run: async function ({ storeItem }) { |
|||
let response = await session.get("https://wwwapi.lcsc.com/v1/home/category"); |
|||
|
|||
assureResponse(response); |
|||
assert(response.body.length > 0); |
|||
assert(response.statusCode === 200); |
|||
|
|||
function processCategoryEntries(categories) { |
|||
for (let category of categories) { |
|||
let productCount = category.productNum; |
|||
let pageCount = Math.ceil(productCount / 500); |
|||
|
|||
// NOTE: We do *not* use the page count indicated by the API, but instead calculate it ourself from the product count. This is because the API-specified page count will cap out at the equivalent of 10k items, even when more pages than that are actually available.
|
|||
for (let i = 1; i <= pageCount; i++) { |
|||
storeItem({ |
|||
id: `lcsc:category:${category.catalogId}:page-${i}`, |
|||
tags: [ "lcsc:category" ], |
|||
data: { |
|||
... category, |
|||
pageNumber: i |
|||
} |
|||
}); |
|||
} |
|||
|
|||
if (category.childCatelogs != null) { |
|||
processCategoryEntries(category.childCatelogs); |
|||
} |
|||
} |
|||
} |
|||
|
|||
processCategoryEntries(response.body); |
|||
} |
|||
}, |
|||
"lcsc:scrapeCategory": { |
|||
ttl: "30d", |
|||
taskInterval: "1m", |
|||
run: async function ({ data, storeItem, deleteItem }) { |
|||
let response = await session.post(`https://wwwapi.lcsc.com/v1/products/list`, { |
|||
catalogIdList: [ data.catalogId ], |
|||
currentPage: data.pageNumber, |
|||
pageSize: 500, |
|||
paramNameValueMap: {} |
|||
}); |
|||
|
|||
assureResponse(response); |
|||
assert(response.statusCode === 200); |
|||
assert(response.body.productList != null); // Missing from stale queued requests?
|
|||
assert(response.body.productList.length > 0); |
|||
|
|||
for (let item of response.body.productList) { |
|||
storeItem({ |
|||
// NOTE: item.productId seems like the database ID on the website, but item.productCode is the actual LCSC part number used internally for inventory management, so we use that for identification instead
|
|||
id: `lcsc:product:${item.productCode}`, |
|||
tags: [ "lcsc:product" ], |
|||
data: item |
|||
}); |
|||
} |
|||
|
|||
// We don't keep around page items, because the amount of pages for a category can change, and so this isn't a stable identifier. They'll be recreated on the next category scrape anyway.
|
|||
deleteItem(); |
|||
} |
|||
}, |
|||
"lcsc:normalizeProduct": { |
|||
version: "7", |
|||
parallelTasks: 50, |
|||
run: async function (api) { |
|||
let { data } = api; |
|||
|
|||
createDatasheet(api, { |
|||
priority: 0.4, |
|||
source: "lcsc", |
|||
manufacturer: data.brandNameEn, |
|||
productID: data.productCode, |
|||
name: data.productModel, |
|||
description: data.productIntroEn, |
|||
url: data.pdfUrl |
|||
}); |
|||
} |
|||
}, |
|||
} |
|||
}; |
|||
}; |
@ -0,0 +1,211 @@ |
|||
"use strict"; |
|||
|
|||
const cheerio = require("cheerio"); |
|||
const url = require("url"); |
|||
const assert = require("assert"); |
|||
const syncpipe = require("syncpipe"); |
|||
const htmlEntities = require("html-entities"); |
|||
|
|||
const createDatasheet = require("../../shared/create-datasheet"); |
|||
const assureResponse = require("../../shared/assure-response"); |
|||
const getUntaggedText = require("../../shared/get-untagged-text"); |
|||
|
|||
// ST Microelectronics
|
|||
|
|||
/**
 * Quick-and-dirty category ID parsing from category pages.
 *
 * @param {string} string - Page source to search for a `"prmisID":"..."` pair.
 * @returns {string} The value of the first `prmisID` found.
 * @throws {Error} When the text contains no `prmisID` pair.
 */
function extractID(string) {
	const found = /"prmisID":"([^"]+)"/.exec(string);

	if (found == null) {
		throw new Error(`ST: prmis ID expected but not found`);
	}

	return found[1];
}
|||
|
|||
module.exports = function ({ session }) { |
|||
return { |
|||
seed: [{ |
|||
id: "st:home", |
|||
tags: [ "st:home" ], |
|||
data: {} |
|||
}], |
|||
tags: { |
|||
"st:home": [ "st:findCategories" ], |
|||
"st:category": [ "st:scrapeCategory" ], |
|||
"st:product": [ "st:scrapeProduct", "st:normalizeProduct" ], |
|||
}, |
|||
tasks: { |
|||
"st:findCategories": { |
|||
ttl: "15d", |
|||
run: async function ({ createItem }) { |
|||
let response = await session.get("https://www.st.com/content/st_com/en.html"); |
|||
assureResponse(response); |
|||
|
|||
let $ = cheerio.load(response.body); |
|||
|
|||
let links = $("#Top_Menu_Products :is(a.st-nav__blockmenu-title, a.st-nav__blockmenu-link)") |
|||
.toArray() |
|||
.map((element) => $(element).attr("href")) |
|||
.map((relativeURL) => url.resolve("https://www.st.com/", relativeURL)); |
|||
|
|||
for (let link of links) { |
|||
createItem({ |
|||
id: `st:category:${link}`, |
|||
tags: [ "st:category" ], |
|||
data: { url: link } |
|||
}); |
|||
} |
|||
} |
|||
}, |
|||
"st:scrapeCategory": { |
|||
ttl: "1d", |
|||
taskInterval: "60s", |
|||
version: "2", |
|||
run: async function({ data, createItem }) { |
|||
let response = await session.get(data.url); |
|||
assureResponse(response); |
|||
|
|||
let prmisID = extractID(response.body.toString()); |
|||
|
|||
let listingResponse = await session.get(`https://www.st.com/en/documentation/scraper.cxst-ps-grid.html/${encodeURIComponent(prmisID)}.json`, { noDecode: true }); |
|||
assureResponse(response); |
|||
let listingBuffer = listingResponse.body; |
|||
|
|||
if (listingBuffer.length > 0) { |
|||
// This is a category that has a product explorer
|
|||
let listing = JSON.parse(listingBuffer.toString()); |
|||
|
|||
let cellNames = listing.columns.map((column) => { |
|||
let cellName = (column.identifier != null) |
|||
? `${column.identifier}_${column.qualifier_identifier}` |
|||
: `nonstandard:${column.name}:${column.qualifier}` |
|||
|
|||
createItem({ |
|||
id: `st:column:${cellName}`, |
|||
tags: [ "st:column" ], |
|||
data: column |
|||
}); |
|||
|
|||
return cellName; |
|||
}); |
|||
|
|||
for (let row of listing.rows) { |
|||
assert(row.productId != null); |
|||
|
|||
let cellData = syncpipe(row.cells, [ |
|||
(_) => _.map((cell, i) => [ cellNames[i], cell.value ]), |
|||
(_) => Object.fromEntries(_) |
|||
]); |
|||
|
|||
createItem({ |
|||
id: `st:product:${row.productId}`, |
|||
tags: [ "st:product" ], |
|||
data: { |
|||
... row, |
|||
cells: undefined, |
|||
cellData: cellData |
|||
} |
|||
}); |
|||
} |
|||
} else { |
|||
console.warn("Warning: empty response, category does not have product explorer"); |
|||
} |
|||
} |
|||
}, |
|||
"st:scrapeProduct": { |
|||
ttl: "15d", |
|||
taskInterval: "5s", |
|||
run: async function({ data, createItem, updateData, expireDependents }) { |
|||
if (data.productFolderUrl == null) { |
|||
throw new Error(`No known product page URL`); |
|||
} |
|||
|
|||
let productPageURL = url.resolve("https://www.st.com/", data.productFolderUrl); |
|||
|
|||
let response = await session.get(productPageURL); |
|||
assureResponse(response); |
|||
|
|||
let $ = cheerio.load(response.body); |
|||
|
|||
let datasheetLink = $("a[data-js='datasheetLink']").attr("href"); |
|||
let datasheetURL = (datasheetLink != null) |
|||
? url.resolve(productPageURL, datasheetLink) |
|||
: null; |
|||
|
|||
let resources = $(".st-table--resources") |
|||
.find("h3").toArray() |
|||
.map((heading) => { |
|||
let $heading = $(heading); |
|||
let sectionID = $heading.attr("id"); |
|||
let sectionTitle = $heading.text().trim(); |
|||
|
|||
let $table = $heading.next("table"); |
|||
|
|||
let items = $table |
|||
.find("tbody tr").toArray() |
|||
.map((row) => { |
|||
let $row = $(row); |
|||
let $mainView = $row.find(".visible-on-desktop-only"); |
|||
let $link = $mainView.find("a.st-link"); |
|||
|
|||
return { |
|||
url: url.resolve(productPageURL, $link.attr("href")), |
|||
documentID: $link.find("span.st-font--bold").text().trim(), |
|||
description: $link.find("span:not(.st-font--bold)").text().trim(), |
|||
version: getUntaggedText($link), |
|||
date: $row.find(".visible-on-desktop-only[data-latest-update]").text().trim() |
|||
}; |
|||
}); |
|||
|
|||
return { |
|||
sectionID: sectionID, |
|||
sectionTitle: sectionTitle, |
|||
items: items |
|||
}; |
|||
}); |
|||
|
|||
updateData((data) => { |
|||
return { |
|||
... data, |
|||
datasheetLink: datasheetURL, |
|||
resources: resources |
|||
}; |
|||
}); |
|||
|
|||
expireDependents(); |
|||
|
|||
for (let section of resources) { |
|||
for (let resource of section.items) { |
|||
createItem({ |
|||
id: `st:resource:${resource.url}`, |
|||
tags: (resource.url === datasheetURL) |
|||
? [ "st:resource", "st:datasheet" ] |
|||
: [ "st_resource" ], |
|||
data: { url: resource.url } |
|||
}); |
|||
} |
|||
} |
|||
} |
|||
}, |
|||
"st:normalizeProduct": { |
|||
dependsOn: [ "st:scrapeProduct" ], |
|||
version: "8", |
|||
parallelTasks: 50, |
|||
run: async function (api) { |
|||
let { data } = api; |
|||
|
|||
createDatasheet(api, { |
|||
priority: 0.8, |
|||
source: "st", |
|||
manufacturer: "STMicroelectronics", |
|||
productID: data.productId, |
|||
name: data.cellData["XJE010_VT-007"], |
|||
description: htmlEntities.decode(data.cellData["XJE014_VT-007"]), |
|||
url: data.datasheetLink |
|||
}); |
|||
} |
|||
}, |
|||
} |
|||
}; |
|||
}; |
@ -0,0 +1,202 @@ |
|||
"use strict"; |
|||
|
|||
const assert = require("assert"); |
|||
const cheerio = require("cheerio"); |
|||
const syncpipe = require("syncpipe"); |
|||
const url = require("url"); |
|||
|
|||
const pipe = require("@promistream/pipe"); |
|||
const simpleSink = require("@promistream/simple-sink"); |
|||
const assureResponse = require("../../shared/assure-response"); |
|||
const parseSitemapResponse = require("../../shared/parse-sitemap-response"); |
|||
const createDatasheet = require("../../shared/create-datasheet"); |
|||
|
|||
// TME.eu
|
|||
|
|||
/**
 * Picks the first usable candidate from a list.
 *
 * @param {Array} options - Candidates, in order of preference.
 * @returns {*} The first entry that is neither null/undefined nor an empty string, or
 *   `undefined` when there is none.
 */
function firstMatch(options) {
	return options.find((candidate) => candidate != null && candidate !== "");
}
|||
|
|||
module.exports = function ({ session }) { |
|||
return { |
|||
seed: [{ |
|||
id: "tme:sitemap:index", |
|||
tags: [ "tme:sitemap" ], |
|||
data: { url: "https://www.tme.eu/en/sitemap.xml" } |
|||
// TODO: Delete derived sitemap entries
|
|||
}], |
|||
tags: { |
|||
"tme:sitemap": [ "tme:scrapeSitemap" ], |
|||
"tme:product": [ "tme:scrapeProduct", "tme:normalizeProduct" ], |
|||
}, |
|||
tasks: { |
|||
"tme:scrapeSitemap": { |
|||
ttl: "3d", |
|||
taskInterval: "30s", |
|||
run: async function ({ data, createItem, deleteItem }) { |
|||
let response = await session.get(data.url, { stream: true }); |
|||
assureResponse(response); |
|||
|
|||
let resultCount = 0; |
|||
|
|||
await pipe([ |
|||
parseSitemapResponse(response), |
|||
simpleSink((item) => { |
|||
assert(item.url); |
|||
|
|||
if (item.type === "sitemap") { |
|||
// NOTE: We are only interested in the sitemaps that enumerate components, not those that list categories etc.
|
|||
if (/pip_part[0-9]+\.xml(\.gz)?/.test(item.url)) { |
|||
createItem({ |
|||
id: `tme:sitemap:${item.url}`, |
|||
tags: [ "tme:sitemap" ], |
|||
data: { url: item.url } |
|||
}); |
|||
|
|||
resultCount += 1; |
|||
} |
|||
} else if (item.type === "url") { |
|||
if (item.url.startsWith("https://www.tme.eu/en/details/")) { |
|||
createItem({ |
|||
id: `tme:product:${item.url}`, |
|||
tags: [ "tme:product" ], |
|||
data: { url: item.url } |
|||
}); |
|||
|
|||
resultCount += 1; |
|||
} |
|||
} |
|||
}) |
|||
]).read(); |
|||
|
|||
// If we don't get at least *some* items out of a sitemap, something is wrong - eg. the URL format changed and we are no longer matching anything.
|
|||
assert(resultCount > 0); |
|||
|
|||
// FIXME: Do this here? Or is there a reason *not* to delete the sitemap entry?
|
|||
// deleteItem();
|
|||
} |
|||
}, |
|||
"tme:scrapeProduct": { |
|||
ttl: "60d", |
|||
taskInterval: "500ms", |
|||
run: async function ({ data, createAlias, updateData, expireDependents }) { |
|||
let response = await session.get(data.url); |
|||
assureResponse(response); |
|||
|
|||
let $ = cheerio.load(response.body); |
|||
|
|||
// FIXME: This is currently broken!
|
|||
let allMetaHeaders = syncpipe($("h2.o-semantic-only-header").toArray(), [ |
|||
(_) => _.map((header) => { |
|||
let $header = $(header); |
|||
|
|||
return [ |
|||
$header.find(".name").text().trim(), |
|||
$header.find(".value").text().trim() |
|||
]; |
|||
}), |
|||
(_) => Object.fromEntries(_) |
|||
]); |
|||
|
|||
let descriptionElement = $(".c-pip__description > h2").eq(0); |
|||
|
|||
let itemData = { |
|||
productID: $("h2.c-pip__symbol--tme .c-pip__symbol-value").eq(0).text().trim(), |
|||
manufacturer: $("a.pip__product-header-title").eq(0).text().trim(), |
|||
model: firstMatch([ |
|||
$("h2.c-pip__symbol--producer .c-pip__symbol-value").eq(0).text().trim(), // Manufacturer part number
|
|||
$("h2.c-pip__symbol--tme .c-pip__symbol-value").eq(0).text().trim(), // TME Symbol
|
|||
]), |
|||
description: (descriptionElement.children().length === 0) // This skips meta fields if there is no description element
|
|||
? descriptionElement.text().trim() |
|||
: null, |
|||
documents: $("div.c-pip__document > a").toArray() |
|||
.map((link) => { |
|||
let relativeLink = $(link).attr("href"); |
|||
|
|||
if (relativeLink != null) { |
|||
return { |
|||
description: $(link).text().trim(), |
|||
url: url.resolve( |
|||
data.url, |
|||
relativeLink |
|||
) |
|||
}; |
|||
} else { |
|||
// Probably a video popup
|
|||
return null; |
|||
} |
|||
}) |
|||
.filter((item) => { |
|||
return item != null; |
|||
}), |
|||
// TODO: Scrape prices
|
|||
}; |
|||
|
|||
assert(itemData.productID != null); |
|||
assert(itemData.manufacturer != null); |
|||
assert(itemData.model != null); |
|||
|
|||
createAlias({ from: `tme:product:${itemData.productID}` }); |
|||
|
|||
updateData((oldData) => ({ |
|||
... oldData, |
|||
itemData: itemData |
|||
})); |
|||
|
|||
expireDependents(); |
|||
} |
|||
}, |
|||
"tme:normalizeProduct": { |
|||
dependsOn: [ "tme:scrapeProduct" ], |
|||
version: "5", |
|||
parallelTasks: 50, |
|||
run: async function (api) { |
|||
let { data } = api; |
|||
|
|||
function isEnglish(document) { |
|||
return /\sen\s*$/i.test(document.description); |
|||
} |
|||
|
|||
if (data.itemData.documents.length > 0) { |
|||
if (typeof data.itemData.documents[0] === "string") { |
|||
// Temporary workaround for the dependsOn not taking into account task versions, and some old records existing with a wrong documents structure
|
|||
return; |
|||
} |
|||
|
|||
let manufacturer = data.itemData.manufacturer; |
|||
let modelName = data.itemData.model; |
|||
let description = data.itemData.description; |
|||
let productID = data.itemData.productID; |
|||
|
|||
let firstEnglish = data.itemData.documents.find((document) => isEnglish(document)); |
|||
|
|||
let bestDocument = (firstEnglish != null) |
|||
? firstEnglish |
|||
: data.itemData.documents[0]; |
|||
|
|||
let bestDocumentIsEnglish = isEnglish(bestDocument); |
|||
|
|||
createDatasheet(api, { |
|||
priority: (bestDocumentIsEnglish) |
|||
? 0.6 |
|||
: 0.5, |
|||
source: "tme", |
|||
manufacturer: manufacturer, |
|||
productID: productID, |
|||
name: modelName, |
|||
description: description, |
|||
url: bestDocument.url, |
|||
// NOTE: Most (but not all!) manufacturers on TME are, incorrectly, in ALL-CAPS. This 'fixes' those cases through best-effort capitalization. Many (but less!) will still be wrong and need to be fixed later.
|
|||
fixCasing: true |
|||
}); |
|||
} |
|||
} |
|||
}, |
|||
} |
|||
}; |
|||
}; |
@ -1,12 +0,0 @@ |
|||
"use strict"; |
|||
|
|||
module.exports = function extractID(string) { |
|||
// Quick-and-dirty category ID parsing from category pages
|
|||
let match = /"prmisID":"([^"]+)"/.exec(string); |
|||
|
|||
if (match != null) { |
|||
return match[1]; |
|||
} else { |
|||
throw new Error(`ST: prmis ID expected but not found`); |
|||
} |
|||
}; |
@ -1,28 +0,0 @@ |
|||
"use strict"; |
|||
|
|||
const cheerio = require("cheerio"); |
|||
const url = require("url"); |
|||
|
|||
const assureResponse = require("../../shared/assure-response"); |
|||
|
|||
module.exports = function findCategories({ session }) { |
|||
return async function ({ createItem }) { |
|||
let response = await session.get("https://www.st.com/content/st_com/en.html"); |
|||
assureResponse(response); |
|||
|
|||
let $ = cheerio.load(response.body); |
|||
|
|||
let links = $("#Top_Menu_Products :is(a.st-nav__blockmenu-title, a.st-nav__blockmenu-link)") |
|||
.toArray() |
|||
.map((element) => $(element).attr("href")) |
|||
.map((relativeURL) => url.resolve("https://www.st.com/", relativeURL)); |
|||
|
|||
for (let link of links) { |
|||
createItem({ |
|||
id: `st:category:${link}`, |
|||
tags: [ "st:category" ], |
|||
data: { url: link } |
|||
}); |
|||
} |
|||
}; |
|||
}; |
@ -1,20 +0,0 @@ |
|||
"use strict"; |
|||
|
|||
const htmlEntities = require("html-entities"); |
|||
const createDatasheet = require("../../shared/create-datasheet"); |
|||
|
|||
module.exports = function normalizeProduct() { |
|||
return async function (api) { |
|||
let { data } = api; |
|||
|
|||
createDatasheet(api, { |
|||
priority: 0.8, |
|||
source: "st", |
|||
manufacturer: "STMicroelectronics", |
|||
productID: data.productId, |
|||
name: data.cellData["XJE010_VT-007"], |
|||
description: htmlEntities.decode(data.cellData["XJE014_VT-007"]), |
|||
url: data.datasheetLink |
|||
}); |
|||
}; |
|||
}; |
@ -1,60 +0,0 @@ |
|||
"use strict"; |
|||
|
|||
const assert = require("assert"); |
|||
const syncpipe = require("syncpipe"); |
|||
|
|||
const assureResponse = require("../../shared/assure-response"); |
|||
const extractId = require("../extract-id"); |
|||
|
|||
module.exports = function scrapeCategory({ session }) { |
|||
return async function({ data, createItem }) { |
|||
let response = await session.get(data.url); |
|||
assureResponse(response); |
|||
|
|||
let prmisID = extractId(response.body.toString()); |
|||
|
|||
let listingResponse = await session.get(`https://www.st.com/en/documentation/scraper.cxst-ps-grid.html/${encodeURIComponent(prmisID)}.json`, { noDecode: true }); |
|||
assureResponse(response); |
|||
let listingBuffer = listingResponse.body; |
|||
|
|||
if (listingBuffer.length > 0) { |
|||
// This is a category that has a product explorer
|
|||
let listing = JSON.parse(listingBuffer.toString()); |
|||
|
|||
let cellNames = listing.columns.map((column) => { |
|||
let cellName = (column.identifier != null) |
|||
? `${column.identifier}_${column.qualifier_identifier}` |
|||
: `nonstandard:${column.name}:${column.qualifier}` |
|||
|
|||
createItem({ |
|||
id: `st:column:${cellName}`, |
|||
tags: [ "st:column" ], |
|||
data: column |
|||
}); |
|||
|
|||
return cellName; |
|||
}); |
|||
|
|||
for (let row of listing.rows) { |
|||
assert(row.productId != null); |
|||
|
|||
let cellData = syncpipe(row.cells, [ |
|||
(_) => _.map((cell, i) => [ cellNames[i], cell.value ]), |
|||
(_) => Object.fromEntries(_) |
|||
]); |
|||
|
|||
createItem({ |
|||
id: `st:product:${row.productId}`, |
|||
tags: [ "st:product" ], |
|||
data: { |
|||
... row, |
|||
cells: undefined, |
|||
cellData: cellData |
|||
} |
|||
}); |
|||
} |
|||
} else { |
|||
console.warn("Warning: empty response, category does not have product explorer"); |
|||
} |
|||
}; |
|||
}; |
@ -1,81 +0,0 @@ |
|||
"use strict"; |
|||
|
|||
const cheerio = require("cheerio"); |
|||
const url = require("url"); |
|||
|
|||
const assureResponse = require("../../shared/assure-response"); |
|||
const getUntaggedText = require("../../shared/get-untagged-text"); |
|||
|
|||
module.exports = function scrapeProduct({ session }) { |
|||
return async function({ data, createItem, updateData, expireDependents }) { |
|||
if (data.productFolderUrl == null) { |
|||
throw new Error(`No known product page URL`); |
|||
} |
|||
|
|||
let productPageURL = url.resolve("https://www.st.com/", data.productFolderUrl); |
|||
|
|||
let response = await session.get(productPageURL); |
|||
assureResponse(response); |
|||
|
|||
let $ = cheerio.load(response.body); |
|||
|
|||
let datasheetLink = $("a[data-js='datasheetLink']").attr("href"); |
|||
let datasheetURL = (datasheetLink != null) |
|||
? url.resolve(productPageURL, datasheetLink) |
|||
: null; |
|||
|
|||
let resources = $(".st-table--resources") |
|||
.find("h3").toArray() |
|||
.map((heading) => { |
|||
let $heading = $(heading); |
|||
let sectionID = $heading.attr("id"); |
|||
let sectionTitle = $heading.text().trim(); |
|||
|
|||
let $table = $heading.next("table"); |
|||
|
|||
let items = $table |
|||
.find("tbody tr").toArray() |
|||
.map((row) => { |
|||
let $row = $(row); |
|||
let $mainView = $row.find(".visible-on-desktop-only"); |
|||
let $link = $mainView.find("a.st-link"); |
|||
|
|||
return { |
|||
url: url.resolve(productPageURL, $link.attr("href")), |
|||
documentID: $link.find("span.st-font--bold").text().trim(), |
|||
description: $link.find("span:not(.st-font--bold)").text().trim(), |
|||
version: getUntaggedText($link), |
|||
date: $row.find(".visible-on-desktop-only[data-latest-update]").text().trim() |
|||
}; |
|||
}); |
|||
|
|||
return { |
|||
sectionID: sectionID, |
|||
sectionTitle: sectionTitle, |
|||
items: items |
|||
}; |
|||
}); |
|||
|
|||
updateData((data) => { |
|||
return { |
|||
... data, |
|||
datasheetLink: datasheetURL, |
|||
resources: resources |
|||
}; |
|||
}); |
|||
|
|||
expireDependents(); |
|||
|
|||
for (let section of resources) { |
|||
for (let resource of section.items) { |
|||
createItem({ |
|||
id: `st:resource:${resource.url}`, |
|||
tags: (resource.url === datasheetURL) |
|||
? [ "st:resource", "st:datasheet" ] |
|||
: [ "st_resource" ], |
|||
data: { url: resource.url } |
|||
}); |
|||
} |
|||
} |
|||
}; |
|||
}; |
@ -1,47 +0,0 @@ |
|||
"use strict"; |
|||
|
|||
const createDatasheet = require("../../shared/create-datasheet"); |
|||
|
|||
/**
 * Tells whether a document's description ends in a standalone "en" marker
 * (case-insensitive, preceded by whitespace, optionally followed by whitespace).
 *
 * @param {{ description: string }} document
 * @returns {boolean}
 */
function isEnglish(document) {
	const trailingLanguageMarker = /\sen\s*$/i;

	return trailingLanguageMarker.test(document.description);
}
|||
|
|||
module.exports = function tmeNormalizeProduct() { |
|||
return async function (api) { |
|||
let { data } = api; |
|||
|
|||
if (data.itemData.documents.length > 0) { |
|||
if (typeof data.itemData.documents[0] === "string") { |
|||
// Temporary workaround for the dependsOn not taking into account task versions, and some old records existing with a wrong documents structure
|
|||
return; |
|||
} |
|||
|
|||
let manufacturer = data.itemData.manufacturer; |
|||
let modelName = data.itemData.model; |
|||
let description = data.itemData.description; |
|||
let productID = data.itemData.productID; |
|||
|
|||
let firstEnglish = data.itemData.documents.find((document) => isEnglish(document)); |
|||
|
|||
let bestDocument = (firstEnglish != null) |
|||
? firstEnglish |
|||
: data.itemData.documents[0]; |
|||
|
|||
let bestDocumentIsEnglish = isEnglish(bestDocument); |
|||
|
|||
createDatasheet(api, { |
|||
priority: (bestDocumentIsEnglish) |
|||
? 0.6 |
|||
: 0.5, |
|||
source: "tme", |
|||
manufacturer: manufacturer, |
|||
productID: productID, |
|||
name: modelName, |
|||
description: description, |
|||
url: bestDocument.url, |
|||
// NOTE: Most (but not all!) manufacturers on TME are, incorrectly, in ALL-CAPS. This 'fixes' those cases through best-effort capitalization. Many (but less!) will still be wrong and need to be fixed later.
|
|||
fixCasing: true |
|||
}); |
|||
} |
|||
}; |
|||
}; |
@ -1,86 +0,0 @@ |
|||
"use strict"; |
|||
|
|||
const assert = require("assert"); |
|||
const cheerio = require("cheerio"); |
|||
const syncpipe = require("syncpipe"); |
|||
const url = require("url"); |
|||
|
|||
const assureResponse = require("../../shared/assure-response"); |
|||
|
|||
/**
 * Returns the first entry that is neither null/undefined nor an empty string.
 *
 * @param {Iterable<*>} options - Candidates, in order of preference.
 * @returns {*} The first usable candidate, or undefined when there is none.
 */
function firstMatch(options) {
	for (const candidate of options) {
		const isBlank = (candidate == null || candidate === "");

		if (isBlank) {
			continue;
		}

		return candidate;
	}

	return undefined;
}
|||
|
|||
module.exports = function tmeScrapeProduct({ session }) { |
|||
return async function ({ data, createAlias, updateData, expireDependents }) { |
|||
let response = await session.get(data.url); |
|||
assureResponse(response); |
|||
|
|||
let $ = cheerio.load(response.body); |
|||
|
|||
// FIXME: This is currently broken!
|
|||
let allMetaHeaders = syncpipe($("h2.o-semantic-only-header").toArray(), [ |
|||
(_) => _.map((header) => { |
|||
let $header = $(header); |
|||
|
|||
return [ |
|||
$header.find(".name").text().trim(), |
|||
$header.find(".value").text().trim() |
|||
]; |
|||
}), |
|||
(_) => Object.fromEntries(_) |
|||
]); |
|||
|
|||
let descriptionElement = $(".c-pip__description > h2").eq(0); |
|||
|
|||
let itemData = { |
|||
productID: $("h2.c-pip__symbol--tme .c-pip__symbol-value").eq(0).text().trim(), |
|||
manufacturer: $("a.pip__product-header-title").eq(0).text().trim(), |
|||
model: firstMatch([ |
|||
$("h2.c-pip__symbol--producer .c-pip__symbol-value").eq(0).text().trim(), // Manufacturer part number
|
|||
$("h2.c-pip__symbol--tme .c-pip__symbol-value").eq(0).text().trim(), // TME Symbol
|
|||
]), |
|||
description: (descriptionElement.children().length === 0) // This skips meta fields if there is no description element
|
|||
? descriptionElement.text().trim() |
|||
: null, |
|||
documents: $("div.c-pip__document > a").toArray() |
|||
.map((link) => { |
|||
let relativeLink = $(link).attr("href"); |
|||
|
|||
if (relativeLink != null) { |
|||
return { |
|||
description: $(link).text().trim(), |
|||
url: url.resolve( |
|||
data.url, |
|||
relativeLink |
|||
) |
|||
}; |
|||
} else { |
|||
// Probably a video popup
|
|||
return null; |
|||
} |
|||
}) |
|||
.filter((item) => { |
|||
return item != null; |
|||
}), |
|||
// TODO: Scrape prices
|
|||
}; |
|||
|
|||
assert(itemData.productID != null); |
|||
assert(itemData.manufacturer != null); |
|||
assert(itemData.model != null); |
|||
|
|||
createAlias({ from: `tme:product:${itemData.productID}` }); |
|||
|
|||
updateData((oldData) => ({ |
|||
... oldData, |
|||
itemData: itemData |
|||
})); |
|||
|
|||
expireDependents(); |
|||
}; |
|||
}; |
@ -1,50 +0,0 @@ |
|||
"use strict"; |
|||
|
|||
const assert = require("assert"); |
|||
|
|||
const pipe = require("@promistream/pipe"); |
|||
const simpleSink = require("@promistream/simple-sink"); |
|||
const assureResponse = require("../../shared/assure-response"); |
|||
const parseSitemapResponse = require("../../shared/parse-sitemap-response"); |
|||
|
|||
module.exports = function tmeScrapeSitemap({ session }) { |
|||
return async function ({ data, createItem }) { |
|||
let response = await session.get(data.url, { stream: true }); |
|||
assureResponse(response); |
|||
|
|||
let resultCount = 0; |
|||
|
|||
await pipe([ |
|||
parseSitemapResponse(response), |
|||
simpleSink((item) => { |
|||
assert(item.url); |
|||
|
|||
if (item.type === "sitemap") { |
|||
// NOTE: We are only interested in the sitemaps that enumerate components, not those that list categories etc.
|
|||
if (/pip_part[0-9]+\.xml(\.gz)?/.test(item.url)) { |
|||
createItem({ |
|||
id: `tme:sitemap:${item.url}`, |
|||
tags: [ "tme:sitemap" ], |
|||
data: { url: item.url } |
|||
}); |
|||
|
|||
resultCount += 1; |
|||
} |
|||
} else if (item.type === "url") { |
|||
if (item.url.startsWith("https://www.tme.eu/en/details/")) { |
|||
createItem({ |
|||
id: `tme:product:${item.url}`, |
|||
tags: [ "tme:product" ], |
|||
data: { url: item.url } |
|||
}); |
|||
|
|||
resultCount += 1; |
|||
} |
|||
} |
|||
}) |
|||
]).read(); |
|||
|
|||
// If we don't get at least *some* items out of a sitemap, something is wrong - eg. the URL format changed and we are no longer matching anything.
|
|||
assert(resultCount > 0); |
|||
}; |
|||
}; |
@ -0,0 +1,3 @@ |
|||
"use strict"; |
|||
|
|||
// Load "execall" through fix-esm's own require() and print the string form of what it returns.
const fixESM = require("fix-esm");
const execall = fixESM.require("execall");

console.log(execall.toString());
@ -0,0 +1,7 @@ |
|||