Browse Source

Fix LCSC scraper

master
Sven Slootweg 7 months ago
parent
commit
cbb1e11fb2
  1. 1
      index.js
  2. 19
      lib/lcsc/get-csrf-token.js
  3. 60
      lib/lcsc/task/find-categories.js
  4. 10
      lib/lcsc/task/normalize-product.js
  5. 61
      lib/lcsc/task/scrape-category.js
  6. 33
      lib/lcsc/with-csrf-token.js
  7. 1
      lib/shared/map-manufacturer/mapping.js

1
index.js

@ -92,6 +92,7 @@ module.exports = {
},
// LCSC
// FIXME: Commenting out a bunch of tasks but not removing them from the tag assignments will result in an error, but will *not* exit the program. That's probably not right?
"lcsc:findCategories": {
ttl: "30d",
taskVersion: "1",

19
lib/lcsc/get-csrf-token.js

@ -1,19 +0,0 @@
"use strict";
const Promise = require("bluebird");
const assureResponse = require("../shared/assure-response");
const matchOrFail = require("../shared/match-or-fail");
module.exports = function ({ session }) {
return function getCSRFToken() {
return Promise.try(() => {
return session.get("https://lcsc.com/products");
}).then((response) => {
assureResponse(response);
let [ extractedToken ] = matchOrFail(/'X-CSRF-TOKEN': '([^']+)'/, response.body);
return extractedToken;
});
};
};

60
lib/lcsc/task/find-categories.js

@ -5,56 +5,38 @@ const assert = require("assert");
const assureResponse = require("../../shared/assure-response");
module.exports = function lcscFindCategories(state) {
const withCsrfToken = require("../with-csrf-token")(state);
let { session } = state;
return async function ({ createItem }) {
let response = await withCsrfToken((token) => {
return session.post("https://lcsc.com/products/categories", {
manufacturer: "",
in_stock: "false",
is_RoHS: "false"
}, {
headers: {
"accept": "application/json, text/javascript, */*; q=0.01",
"X-CSRF-TOKEN": token
}
});
});
let response = await session.get("https://wwwapi.lcsc.com/v1/home/category");
assureResponse(response);
assert(response.body.data.data != null);
assert(response.body.code === 200);
assert(response.body.length > 0);
assert(response.statusCode === 200);
function processCategoryEntries(categories) {
for (let category of categories) {
createItem({
id: `lcsc:category:${category.id}`,
tags: [ "lcsc:category" ],
data: {
... category,
pageNumber: 1
}
});
if (category.subs != null) {
processCategoryEntries(category.subs);
let productCount = category.productNum;
let pageCount = Math.ceil(productCount / 500);
// NOTE: We do *not* use the page count indicated by the API, but instead calculate it ourselves from the product count. This is because the API-specified page count will cap out at the equivalent of 10k items, even when more pages than that are actually available.
for (let i = 1; i <= pageCount; i++) {
createItem({
id: `lcsc:category:${category.catalogId}:page-${i}`,
tags: [ "lcsc:category" ],
data: {
... category,
pageNumber: i
}
});
}
if (category.childCatelogs != null) {
processCategoryEntries(category.childCatelogs);
}
}
}
// Listing is a {name: data} mapping of categories
processCategoryEntries(Object.values(response.body.data.data));
// for (let category of Object.values(response.body.data.data)) {
// createItem({
// id: `lcsc:category:${category.id}`,
// tags: [ "lcsc:category" ],
// data: {
// ... category,
// pageNumber: 1
// }
// });
// }
processCategoryEntries(response.body);
};
};

10
lib/lcsc/task/normalize-product.js

@ -9,11 +9,11 @@ module.exports = function lcscNormalizeProduct() {
createDatasheet(api, {
priority: 0.4,
source: "lcsc",
manufacturer: data.manufacturer.en,
productID: data.number,
name: data.info.number,
description: data.description,
url: data.datasheet.pdf
manufacturer: data.brandNameEn,
productID: data.productCode,
name: data.productModel,
description: data.productIntroEn,
url: data.pdfUrl
});
};
};

61
lib/lcsc/task/scrape-category.js

@ -7,67 +7,30 @@ const assureResponse = require("../../shared/assure-response");
// TODO: Validate response formats with validatem instead
module.exports = function lcscScrapeCategory(state) {
const withCSRFToken = require("../with-csrf-token")(state);
let { session } = state;
return async function ({ data, createItem, deleteItem, updateData }) {
let response = await withCSRFToken((token) => {
return session.post(`https://lcsc.com/api/products/search`, {
current_page: String(data.pageNumber),
category: String(data.id),
in_stock: "false",
is_RoHS: "false",
show_icon: "false",
search_content: "",
limit: "10000"
}, {
headers: {
"accept": "application/json, text/javascript, */*; q=0.01",
"X-CSRF-TOKEN": token
}
});
let response = await session.post(`https://wwwapi.lcsc.com/v1/products/list`, {
catalogIdList: [ data.catalogId ],
currentPage: data.pageNumber,
pageSize: 500,
paramNameValueMap: {}
});
assureResponse(response);
assert(response.body.code === 200);
assert(response.body.result.data != null);
assert(response.statusCode === 200);
assert(response.body.productList.length > 0);
for (let item of response.body.result.data) {
for (let item of response.body.productList) {
createItem({
// NOTE: item.id seems like the database ID on the website, but item.number is the actual LCSC part number used internally for inventory management, so we use that for identification instead
id: `lcsc:product:${item.number}`,
// NOTE: item.productId seems like the database ID on the website, but item.productCode is the actual LCSC part number used internally for inventory management, so we use that for identification instead
id: `lcsc:product:${item.productCode}`,
tags: [ "lcsc:product" ],
data: item
});
}
if (data.pageNumber === 1) {
let totalPageCount = response.body.result.total_page;
assert(totalPageCount != null);
updateData((data) => ({
... data,
pageCount: totalPageCount
}));
// for (let i = 2; i <= totalPageCount; i++) {
// createItem({
// id: `lcsc:category:${data.id}:page-${i}`,
// tags: [ "lcsc:category" ],
// data: {
// id: data.id,
// pageNumber: i
// }
// });
// }
// FIXME: Figure out a workaround for the 10k-items-per-category cap
if (totalPageCount > 1) {
console.warn(`WARNING (LCSC): More than one page for category ${data.id}, but cannot paginate!`);
}
} else {
// We don't keep around items representing pages beyond the first, after indexing them, because total page count can change and the page numbers are not stable identifiers. We can just recreate them on the next scrape of the first page (which always exists).
deleteItem();
}
// We don't keep around page items, because the number of pages for a category can change, and so this isn't a stable identifier. They'll be recreated on the next category scrape anyway.
deleteItem();
};
};

33
lib/lcsc/with-csrf-token.js

@ -1,33 +0,0 @@
"use strict";
// FIXME: This is a bit of a hack to persist the CSRF token across calls. There is probably a better solution for this, but that sort of state management needs to be handled on a scraping-server level, probably.
let token;
module.exports = function (state) {
const getCsrfToken = require("./get-csrf-token")(state);
return async function withCSRFToken(callback) {
async function obtainToken() {
token = await getCsrfToken();
}
async function attemptCallback() {
let response = await callback(token);
if (response.statusCode === 419) {
await obtainToken();
return attemptCallback();
} else {
return response;
}
}
if (token == null) {
await obtainToken();
}
return attemptCallback();
};
};

1
lib/shared/map-manufacturer/mapping.js

@ -1,6 +1,7 @@
"use strict";
// FIXME: Now that the manufacturer name is used in the item key, need to figure out a way to change those keys when it changes, eg. through a script
// FIXME: Figure out how to handle "Abb - Thomas & Betts" and "Abb - Kopex" from Farnell, which describes an acquisition. There should probably be some "also known as" feature in the search UI?
module.exports = {
"realtek semicon": "Realtek",

Loading…
Cancel
Save