diff --git a/index.js b/index.js index 8562a50..e401828 100644 --- a/index.js +++ b/index.js @@ -92,6 +92,7 @@ module.exports = { }, // LCSC + // FIXME: Commenting out a bunch of tasks but not removing them from the tag assignments will result in an error, but will *not* exit the program. That's probably not right? "lcsc:findCategories": { ttl: "30d", taskVersion: "1", diff --git a/lib/lcsc/get-csrf-token.js b/lib/lcsc/get-csrf-token.js deleted file mode 100644 index 1ecb4f7..0000000 --- a/lib/lcsc/get-csrf-token.js +++ /dev/null @@ -1,19 +0,0 @@ -"use strict"; - -const Promise = require("bluebird"); - -const assureResponse = require("../shared/assure-response"); -const matchOrFail = require("../shared/match-or-fail"); - -module.exports = function ({ session }) { - return function getCSRFToken() { - return Promise.try(() => { - return session.get("https://lcsc.com/products"); - }).then((response) => { - assureResponse(response); - - let [ extractedToken ] = matchOrFail(/'X-CSRF-TOKEN': '([^']+)'/, response.body); - return extractedToken; - }); - }; -}; diff --git a/lib/lcsc/task/find-categories.js b/lib/lcsc/task/find-categories.js index 0203b9a..2844ab8 100644 --- a/lib/lcsc/task/find-categories.js +++ b/lib/lcsc/task/find-categories.js @@ -5,56 +5,38 @@ const assert = require("assert"); const assureResponse = require("../../shared/assure-response"); module.exports = function lcscFindCategories(state) { - const withCsrfToken = require("../with-csrf-token")(state); - let { session } = state; return async function ({ createItem }) { - let response = await withCsrfToken((token) => { - return session.post("https://lcsc.com/products/categories", { - manufacturer: "", - in_stock: "false", - is_RoHS: "false" - }, { - headers: { - "accept": "application/json, text/javascript, */*; q=0.01", - "X-CSRF-TOKEN": token - } - }); - }); + let response = await session.get("https://wwwapi.lcsc.com/v1/home/category"); assureResponse(response); - assert(response.body.data.data != 
null); - assert(response.body.code === 200); + assert(response.body.length > 0); + assert(response.statusCode === 200); function processCategoryEntries(categories) { for (let category of categories) { - createItem({ - id: `lcsc:category:${category.id}`, - tags: [ "lcsc:category" ], - data: { - ... category, - pageNumber: 1 - } - }); - - if (category.subs != null) { - processCategoryEntries(category.subs); + let productCount = category.productNum; + let pageCount = Math.ceil(productCount / 500); + + // NOTE: We do *not* use the page count indicated by the API, but instead calculate it ourselves from the product count. This is because the API-specified page count will cap out at the equivalent of 10k items, even when more pages than that are actually available. + for (let i = 1; i <= pageCount; i++) { + createItem({ + id: `lcsc:category:${category.catalogId}:page-${i}`, + tags: [ "lcsc:category" ], + data: { + ... category, + pageNumber: i + } + }); + } + + if (category.childCatelogs != null) { + processCategoryEntries(category.childCatelogs); } } } - // Listing is a {name: data} mapping of categories - processCategoryEntries(Object.values(response.body.data.data)); - // for (let category of Object.values(response.body.data.data)) { - // createItem({ - // id: `lcsc:category:${category.id}`, - // tags: [ "lcsc:category" ], - // data: { - // ... 
category, - // pageNumber: 1 - // } - // }); - // } + processCategoryEntries(response.body); }; }; diff --git a/lib/lcsc/task/normalize-product.js b/lib/lcsc/task/normalize-product.js index f7f8173..ac5db0d 100644 --- a/lib/lcsc/task/normalize-product.js +++ b/lib/lcsc/task/normalize-product.js @@ -9,11 +9,11 @@ module.exports = function lcscNormalizeProduct() { createDatasheet(api, { priority: 0.4, source: "lcsc", - manufacturer: data.manufacturer.en, - productID: data.number, - name: data.info.number, - description: data.description, - url: data.datasheet.pdf + manufacturer: data.brandNameEn, + productID: data.productCode, + name: data.productModel, + description: data.productIntroEn, + url: data.pdfUrl }); }; }; diff --git a/lib/lcsc/task/scrape-category.js b/lib/lcsc/task/scrape-category.js index f71adea..ea65f74 100644 --- a/lib/lcsc/task/scrape-category.js +++ b/lib/lcsc/task/scrape-category.js @@ -7,67 +7,30 @@ const assureResponse = require("../../shared/assure-response"); // TODO: Validate response formats with validatem instead module.exports = function lcscScrapeCategory(state) { - const withCSRFToken = require("../with-csrf-token")(state); - let { session } = state; return async function ({ data, createItem, deleteItem, updateData }) { - let response = await withCSRFToken((token) => { - return session.post(`https://lcsc.com/api/products/search`, { - current_page: String(data.pageNumber), - category: String(data.id), - in_stock: "false", - is_RoHS: "false", - show_icon: "false", - search_content: "", - limit: "10000" - }, { - headers: { - "accept": "application/json, text/javascript, */*; q=0.01", - "X-CSRF-TOKEN": token - } - }); + let response = await session.post(`https://wwwapi.lcsc.com/v1/products/list`, { + catalogIdList: [ data.catalogId ], + currentPage: data.pageNumber, + pageSize: 500, + paramNameValueMap: {} }); assureResponse(response); - assert(response.body.code === 200); - assert(response.body.result.data != null); + 
assert(response.statusCode === 200); + assert(response.body.productList.length > 0); - for (let item of response.body.result.data) { + for (let item of response.body.productList) { createItem({ - // NOTE: item.id seems like the database ID on the website, but item.number is the actual LCSC part number used internally for inventory management, so we use that for identification instead - id: `lcsc:product:${item.number}`, + // NOTE: item.productId seems like the database ID on the website, but item.productCode is the actual LCSC part number used internally for inventory management, so we use that for identification instead + id: `lcsc:product:${item.productCode}`, tags: [ "lcsc:product" ], data: item }); } - if (data.pageNumber === 1) { - let totalPageCount = response.body.result.total_page; - assert(totalPageCount != null); - - updateData((data) => ({ - ... data, - pageCount: totalPageCount - })); - - // for (let i = 2; i <= totalPageCount; i++) { - // createItem({ - // id: `lcsc:category:${data.id}:page-${i}`, - // tags: [ "lcsc:category" ], - // data: { - // id: data.id, - // pageNumber: i - // } - // }); - // } - // FIXME: Figure out a workaround for the 10k-items-per-category cap - if (totalPageCount > 1) { - console.warn(`WARNING (LCSC): More than one page for category ${data.id}, but cannot paginate!`); - } - } else { - // We don't keep around items representing pages beyond the first, after indexing them, because total page count can change and the page numbers are not stable identifiers. We can just recreate them on the next scrape of the first page (which always exists). - deleteItem(); - } + // We don't keep around page items, because the number of pages for a category can change, and so the page number isn't a stable identifier. They'll be recreated on the next category scrape anyway. 
+ deleteItem(); }; }; diff --git a/lib/lcsc/with-csrf-token.js b/lib/lcsc/with-csrf-token.js deleted file mode 100644 index bb721e8..0000000 --- a/lib/lcsc/with-csrf-token.js +++ /dev/null @@ -1,33 +0,0 @@ -"use strict"; - - -// FIXME: This is a bit of a hack to persist the CSRF token across calls. There is probably a better solution for this, but that sort of state management needs to be handled on a scraping-server level, probably. -let token; - -module.exports = function (state) { - const getCsrfToken = require("./get-csrf-token")(state); - - return async function withCSRFToken(callback) { - async function obtainToken() { - token = await getCsrfToken(); - } - - async function attemptCallback() { - let response = await callback(token); - - if (response.statusCode === 419) { - await obtainToken(); - - return attemptCallback(); - } else { - return response; - } - } - - if (token == null) { - await obtainToken(); - } - - return attemptCallback(); - }; -}; diff --git a/lib/shared/map-manufacturer/mapping.js b/lib/shared/map-manufacturer/mapping.js index cb5348d..8857cec 100644 --- a/lib/shared/map-manufacturer/mapping.js +++ b/lib/shared/map-manufacturer/mapping.js @@ -1,6 +1,7 @@ "use strict"; // FIXME: Now that the manufacturer name is used in the item key, need to figure out a way to change those keys when it changes, eg. through a script +// FIXME: Figure out how to handle "Abb - Thomas & Betts" and "Abb - Kopex" from Farnell, which describes an acquisition. There should probably be some "also known as" feature in the search UI? module.exports = { "realtek semicon": "Realtek",