From cc568ab80a566062616aae2618934e016bd33989 Mon Sep 17 00:00:00 2001
From: Sven Slootweg
Date: Sat, 27 Mar 2021 20:03:49 +0100
Subject: [PATCH] Add LCSC scraper, add priority logic for datasheet entry normalization

---
Review note: lib/shared/pick-best-option.js differs from commit cc568ab. Its
second branch now compares `b == null` (the original assigned `b = null`, which
made the function always return `a` whenever `a` had a priority), and the
function gained a doc comment. The blob hash on that file's `index` line is
therefore stale.

 index.js                           | 45 ++++++++++++++----
 lib/lcsc/get-csrf-token.js         | 19 ++++++++
 lib/lcsc/manufacturer-map.js       | 37 +++++++++++++++
 lib/lcsc/task/find-categories.js   | 60 ++++++++++++++++++++++++
 lib/lcsc/task/normalize-product.js | 43 ++++++++++++++++++
 lib/lcsc/task/scrape-category.js   | 73 ++++++++++++++++++++++++++++++
 lib/lcsc/with-csrf-token.js        | 33 ++++++++++++++
 lib/shared/match-or-fail.js        | 11 +++++
 lib/shared/normalize-string.js     | 15 ++++++
 lib/shared/pick-best-option.js     | 22 +++++++++
 lib/st/task/normalize-product.js   | 24 ++++++----
 package.json                       |  2 +
 yarn.lock                          | 10 ++++
 13 files changed, 376 insertions(+), 18 deletions(-)
 create mode 100644 lib/lcsc/get-csrf-token.js
 create mode 100644 lib/lcsc/manufacturer-map.js
 create mode 100644 lib/lcsc/task/find-categories.js
 create mode 100644 lib/lcsc/task/normalize-product.js
 create mode 100644 lib/lcsc/task/scrape-category.js
 create mode 100644 lib/lcsc/with-csrf-token.js
 create mode 100644 lib/shared/match-or-fail.js
 create mode 100644 lib/shared/normalize-string.js
 create mode 100644 lib/shared/pick-best-option.js

diff --git a/index.js b/index.js
index 4ab1dfd..b1c5395 100644
--- a/index.js
+++ b/index.js
@@ -6,14 +6,17 @@ const stScrapeCategory = require("./lib/st/task/scrape-category");
 const stFindCategories = require("./lib/st/task/find-categories");
 const stScrapeProduct = require("./lib/st/task/scrape-product");
 const stNormalizeProduct = require("./lib/st/task/normalize-product");
+const lcscFindCategories = require("./lib/lcsc/task/find-categories");
+const lcscScrapeCategory = require("./lib/lcsc/task/scrape-category");
+const lcscNormalizeProduct = require("./lib/lcsc/task/normalize-product");
 
-let session = bhttp.session({
-	headers: {
-		"user-agent": "seekseek.org beta crawler (contact/problems: admin@cryto.net)"
-	}
-});
-
-let state = { session };
+let state = {
+	session: bhttp.session({
+		headers: {
+			"user-agent": "seekseek.org beta crawler (contact/problems: admin@cryto.net)"
+		}
+	})
+};
 
 module.exports = {
 	database: {
@@ -24,11 +27,18 @@ module.exports = {
 		id: "st:home",
 		tags: [ "st:home" ],
 		data: {}
+	}, {
+		id: "lcsc:home",
+		tags: [ "lcsc:home" ],
+		data: {}
 	}],
 	tags: {
 		"st:home": [ "st:findCategories" ],
 		"st:category": [ "st:scrapeCategory" ],
-		"st:product": [ "st:scrapeProduct", "st:normalizeProduct" ]
+		"st:product": [ "st:scrapeProduct", "st:normalizeProduct" ],
+		"lcsc:home": [ "lcsc:findCategories" ],
+		"lcsc:category": [ "lcsc:scrapeCategory" ],
+		"lcsc:product": [ "lcsc:normalizeProduct" ]
 	},
 	tasks: {
 		"st:findCategories": {
@@ -48,8 +58,23 @@ module.exports = {
 		},
 		"st:normalizeProduct": {
 			dependsOn: [ "st:scrapeProduct" ],
-			version: "3",
+			version: "4",
 			run: stNormalizeProduct(state)
-		}
+		},
+		"lcsc:findCategories": {
+			ttl: "30d",
+			taskVersion: "1",
+			run: lcscFindCategories(state)
+		},
+		"lcsc:scrapeCategory": {
+			ttl: "30d",
+			taskInterval: "1m",
+			run: lcscScrapeCategory(state)
+		},
+		"lcsc:normalizeProduct": {
+			version: "3",
+			parallelTasks: Infinity,
+			run: lcscNormalizeProduct(state)
+		},
 	}
 };
diff --git a/lib/lcsc/get-csrf-token.js b/lib/lcsc/get-csrf-token.js
new file mode 100644
index 0000000..1ecb4f7
--- /dev/null
+++ b/lib/lcsc/get-csrf-token.js
@@ -0,0 +1,19 @@
+"use strict";
+
+const Promise = require("bluebird");
+
+const assureResponse = require("../shared/assure-response");
+const matchOrFail = require("../shared/match-or-fail");
+
+module.exports = function ({ session }) {
+	return function getCSRFToken() {
+		return Promise.try(() => {
+			return session.get("https://lcsc.com/products");
+		}).then((response) => {
+			assureResponse(response);
+
+			let [ extractedToken ] = matchOrFail(/'X-CSRF-TOKEN': '([^']+)'/, response.body);
+			return extractedToken;
+		});
+	};
+};
diff --git a/lib/lcsc/manufacturer-map.js b/lib/lcsc/manufacturer-map.js
new file mode 100644
index 0000000..61f1275
--- /dev/null
+++ b/lib/lcsc/manufacturer-map.js
@@ -0,0 +1,37 @@
+"use strict";
+
+// FIXME: Now that the manufacturer name is used in the item key, need to figure out a way to change those keys when it changes, eg. through a script
+
+module.exports = {
+	"realtek semicon": "Realtek",
+	"ruilon(shenzhen ruilongyuan elec)": "RUILON",
+	"changzhou huawei elec": "Changzhou Huawei",
+	"chengdu ashining tech": "Ashining",
+	"htc korea taejin tech": "HTC Korea / TAEJIN",
+	"utc(unisonic tech)": "Unisonic",
+	"umw(youtai semiconductor co., ltd.)": "UMW / Youtai",
+	"microchip tech": "Microchip",
+	"hrs(hirose)": "Hirose",
+	"mornsun guangzhou s& t": "Mornsun",
+	"bothhand enterprise": "Bothhand",
+	"shenzhen sunyuan tech": "Sunyuan",
+	"txc corp": "TXC",
+
+	// From https://git.cryto.net/seekseek/scrape-documentation/issues/1
+	"2Pai Semi": "2Pai Semi",
+	"3L COILS": "3L",
+	"3M": "3M",
+	"3PEAK": "3PEAK",
+	"(7Q-TEK)": "7Q-Tek",
+	"99IOT": "99 IoT",
+	"Aavid Thermalloy": "Aavid",
+	"ABLIC": "ABLIC / Seiko",
+	"ABOV Semicon": "ABOV",
+	"Abracon LLC": "Abracon",
+	"Acam Messelectronic Gmbh": "Acam",
+	"ACX": "ACX / CoorsTek",
+	"ADDtek Corp": "ADDtek",
+	"Adesto Technologies": "Adesto / Dialog",
+	"AD Semicon": "AD Semicon",
+	"Advanced Monolithic Systems": "AMS / Advanced Monolithic Systems",
+};
diff --git a/lib/lcsc/task/find-categories.js b/lib/lcsc/task/find-categories.js
new file mode 100644
index 0000000..0203b9a
--- /dev/null
+++ b/lib/lcsc/task/find-categories.js
@@ -0,0 +1,60 @@
+"use strict";
+
+const assert = require("assert");
+
+const assureResponse = require("../../shared/assure-response");
+
+module.exports = function lcscFindCategories(state) {
+	const withCsrfToken = require("../with-csrf-token")(state);
+
+	let { session } = state;
+
+	return async function ({ createItem }) {
+		let response = await withCsrfToken((token) => {
+			return session.post("https://lcsc.com/products/categories", {
+				manufacturer: "",
+				in_stock: "false",
+				is_RoHS: "false"
+			}, {
+				headers: {
+					"accept": "application/json, text/javascript, */*; q=0.01",
+					"X-CSRF-TOKEN": token
+				}
+			});
+		});
+
+		assureResponse(response);
+		assert(response.body.data.data != null);
+		assert(response.body.code === 200);
+
+		function processCategoryEntries(categories) {
+			for (let category of categories) {
+				createItem({
+					id: `lcsc:category:${category.id}`,
+					tags: [ "lcsc:category" ],
+					data: {
+						... category,
+						pageNumber: 1
+					}
+				});
+
+				if (category.subs != null) {
+					processCategoryEntries(category.subs);
+				}
+			}
+		}
+
+		// Listing is a {name: data} mapping of categories
+		processCategoryEntries(Object.values(response.body.data.data));
+		// for (let category of Object.values(response.body.data.data)) {
+		// 	createItem({
+		// 		id: `lcsc:category:${category.id}`,
+		// 		tags: [ "lcsc:category" ],
+		// 		data: {
+		// 			... category,
+		// 			pageNumber: 1
+		// 		}
+		// 	});
+		// }
+	};
+};
diff --git a/lib/lcsc/task/normalize-product.js b/lib/lcsc/task/normalize-product.js
new file mode 100644
index 0000000..b84d956
--- /dev/null
+++ b/lib/lcsc/task/normalize-product.js
@@ -0,0 +1,43 @@
+"use strict";
+
+const matchValue = require("match-value");
+const mapObj = require("map-obj");
+
+const normalizeString = require("../../shared/normalize-string");
+const manufacturerMap = require("../manufacturer-map");
+const pickBestOption = require("../../shared/pick-best-option");
+
+let normalizedManufacturerMap = mapObj(manufacturerMap, (key, value) => {
+	return [ key.toLowerCase(), value ];
+});
+
+module.exports = function lcscNormalizeProduct() {
+	return async function ({ data, createItem }) {
+		let url = normalizeString(data.datasheet.pdf);
+		let manufacturer = normalizeString(data.manufacturer.en);
+		let model = normalizeString(data.info.number);
+		let productID = normalizeString(data.number);
+		let description = normalizeString(data.description);
+
+		let mappedManufacturer = (manufacturer != null)
+			? matchValue(manufacturer.toLowerCase(), {
+				... normalizedManufacturerMap,
+				_: manufacturer
+			})
+			: null;
+
+		if (url != null && model != null) {
+			createItem({
+				id: `datasheet:${manufacturer}:${model}`,
+				update: (data) => pickBestOption(data, {
+					priority: 0.4,
+					manufacturer: mappedManufacturer,
+					productID: productID,
+					name: model,
+					description: description,
+					url: url
+				})
+			});
+		}
+	};
+};
diff --git a/lib/lcsc/task/scrape-category.js b/lib/lcsc/task/scrape-category.js
new file mode 100644
index 0000000..f71adea
--- /dev/null
+++ b/lib/lcsc/task/scrape-category.js
@@ -0,0 +1,73 @@
+"use strict";
+
+const assert = require("assert");
+
+const assureResponse = require("../../shared/assure-response");
+
+// TODO: Validate response formats with validatem instead
+
+module.exports = function lcscScrapeCategory(state) {
+	const withCSRFToken = require("../with-csrf-token")(state);
+
+	let { session } = state;
+
+	return async function ({ data, createItem, deleteItem, updateData }) {
+		let response = await withCSRFToken((token) => {
+			return session.post(`https://lcsc.com/api/products/search`, {
+				current_page: String(data.pageNumber),
+				category: String(data.id),
+				in_stock: "false",
+				is_RoHS: "false",
+				show_icon: "false",
+				search_content: "",
+				limit: "10000"
+			}, {
+				headers: {
+					"accept": "application/json, text/javascript, */*; q=0.01",
+					"X-CSRF-TOKEN": token
+				}
+			});
+		});
+
+		assureResponse(response);
+		assert(response.body.code === 200);
+		assert(response.body.result.data != null);
+
+		for (let item of response.body.result.data) {
+			createItem({
+				// NOTE: item.id seems like the database ID on the website, but item.number is the actual LCSC part number used internally for inventory management, so we use that for identification instead
+				id: `lcsc:product:${item.number}`,
+				tags: [ "lcsc:product" ],
+				data: item
+			});
+		}
+
+		if (data.pageNumber === 1) {
+			let totalPageCount = response.body.result.total_page;
+			assert(totalPageCount != null);
+
+			updateData((data) => ({
+				... data,
+				pageCount: totalPageCount
+			}));
+
+			// for (let i = 2; i <= totalPageCount; i++) {
+			// 	createItem({
+			// 		id: `lcsc:category:${data.id}:page-${i}`,
+			// 		tags: [ "lcsc:category" ],
+			// 		data: {
+			// 			id: data.id,
+			// 			pageNumber: i
+			// 		}
+			// 	});
+			// }
+			// FIXME: Figure out a workaround for the 10k-items-per-category cap
+			if (totalPageCount > 1) {
+				console.warn(`WARNING (LCSC): More than one page for category ${data.id}, but cannot paginate!`);
+			}
+		} else {
+			// We don't keep around items representing pages beyond the first, after indexing them, because total page count can change and the page numbers are not stable identifiers. We can just recreate them on the next scrape of the first page (which always exists).
+			deleteItem();
+		}
+	};
+};
diff --git a/lib/lcsc/with-csrf-token.js b/lib/lcsc/with-csrf-token.js
new file mode 100644
index 0000000..bb721e8
--- /dev/null
+++ b/lib/lcsc/with-csrf-token.js
@@ -0,0 +1,33 @@
+"use strict";
+
+
+// FIXME: This is a bit of a hack to persist the CSRF token across calls. There is probably a better solution for this, but that sort of state management needs to be handled on a scraping-server level, probably.
+let token;
+
+module.exports = function (state) {
+	const getCsrfToken = require("./get-csrf-token")(state);
+
+	return async function withCSRFToken(callback) {
+		async function obtainToken() {
+			token = await getCsrfToken();
+		}
+
+		async function attemptCallback() {
+			let response = await callback(token);
+
+			if (response.statusCode === 419) {
+				await obtainToken();
+
+				return attemptCallback();
+			} else {
+				return response;
+			}
+		}
+
+		if (token == null) {
+			await obtainToken();
+		}
+
+		return attemptCallback();
+	};
+};
diff --git a/lib/shared/match-or-fail.js b/lib/shared/match-or-fail.js
new file mode 100644
index 0000000..7e94fb6
--- /dev/null
+++ b/lib/shared/match-or-fail.js
@@ -0,0 +1,11 @@
+"use strict";
+
+module.exports = function matchOrFail(regex, string) {
+	let match = regex.exec(string);
+
+	if (match != null) {
+		return match.slice(1);
+	} else {
+		throw new Error(`Failed to match regex ${regex}`);
+	}
+};
diff --git a/lib/shared/normalize-string.js b/lib/shared/normalize-string.js
new file mode 100644
index 0000000..9bb7660
--- /dev/null
+++ b/lib/shared/normalize-string.js
@@ -0,0 +1,15 @@
+"use strict";
+
+module.exports = function normalizeString(string) {
+	if (string == null) {
+		return null;
+	} else {
+		let trimmed = string.trim();
+
+		if (trimmed.length === 0) {
+			return null;
+		} else {
+			return trimmed;
+		}
+	}
+};
diff --git a/lib/shared/pick-best-option.js b/lib/shared/pick-best-option.js
new file mode 100644
index 0000000..036d7b3
--- /dev/null
+++ b/lib/shared/pick-best-option.js
@@ -0,0 +1,22 @@
+"use strict";
+
+/**
+ * Picks whichever of two datasheet entries has the higher `priority`.
+ *
+ * @param {?object} a - Existing entry; may be null or have no `priority`.
+ * @param {?object} b - Candidate entry; may be null or have no `priority`.
+ * @returns {?object} `b` when `a` is missing or unprioritized, or when `b` has
+ *   a strictly higher priority; `a` in every other case (including ties).
+ */
+module.exports = function pickBestOption(a, b) {
+	if (a == null || a.priority == null) {
+		return b;
+	} else if (b == null || b.priority == null) {
+		return a;
+	} else if (b.priority > a.priority) {
+		return b;
+	} else {
+		// NOTE: We return the first (ie. original) one in the case of a 'tie'
+		return a;
+	}
+};
diff --git a/lib/st/task/normalize-product.js b/lib/st/task/normalize-product.js
index b44d398..02ebafc 100644
--- a/lib/st/task/normalize-product.js
+++ b/lib/st/task/normalize-product.js
@@ -1,18 +1,26 @@
 "use strict";
 
 const htmlEntities = require("html-entities");
+const pickBestOption = require("../../shared/pick-best-option");
 
 module.exports = function normalizeProduct() {
 	return async function ({ data, createItem }) {
+		let manufacturer = "STMicroelectronics";
+		let modelName = data.cellData["XJE010_VT-007"];
+		let description = htmlEntities.decode(data.cellData["XJE014_VT-007"]);
+		let url = data.datasheetLink;
+		let productID = data.productId;
+
 		createItem({
-			id: `datasheet:st:${data.productId}`,
-			data: {
-				manufacturer: "STMicroelectronics",
-				productID: data.productId,
-				name: data.cellData["XJE010_VT-007"],
-				description: htmlEntities.decode(data.cellData["XJE014_VT-007"]),
-				url: data.datasheetLink
-			}
+			id: `datasheet:${manufacturer}:${modelName}`,
+			update: (data) => pickBestOption(data, {
+				priority: 0.8,
+				manufacturer: manufacturer,
+				productID: productID,
+				name: modelName,
+				description: description,
+				url: url
+			})
 		});
 	};
 };
diff --git a/package.json b/package.json
index 17b0232..b641ffc 100644
--- a/package.json
+++ b/package.json
@@ -10,6 +10,8 @@
     "bluebird": "^3.7.2",
     "cheerio": "^1.0.0-rc.5",
     "html-entities": "^2.1.1",
+    "map-obj": "^4.2.0",
+    "match-value": "^1.1.0",
     "syncpipe": "^1.0.0"
   },
   "devDependencies": {
diff --git a/yarn.lock b/yarn.lock
index 1ca28ab..a42a5bb 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -740,6 +740,16 @@ lru-cache@^6.0.0:
   dependencies:
     yallist "^4.0.0"
 
+map-obj@^4.2.0:
+  version "4.2.0"
+  resolved "https://registry.yarnpkg.com/map-obj/-/map-obj-4.2.0.tgz#0e8bc823e2aaca8a0942567d12ed14f389eec153"
+  integrity sha512-NAq0fCmZYGz9UFEQyndp7sisrow4GroyGeKluyKC/chuITZsPyOyC1UJZPJlVFImhXdROIP5xqouRLThT3BbpQ==
+
+match-value@^1.1.0:
+  version "1.1.0"
+  resolved "https://registry.yarnpkg.com/match-value/-/match-value-1.1.0.tgz#ad311ef8bbe2d344a53ec3104e28fe221984b98e"
+  integrity sha512-NOvpobcmkX+l9Eb6r2s3BkR1g1ZwzExDFdXA9d6p1r1O1olLbo88KuzMiBmg43xSpodfm7I6Hqlx2OoySquEgg==
+
 mime@^1.3.4:
   version "1.6.0"
   resolved "https://registry.yarnpkg.com/mime/-/mime-1.6.0.tgz#32cd9e5c64553bd58d19a568af452acff04981b1"