Browse Source

Add LCSC scraper, add priority logic for datasheet entry normalization

master
Sven Slootweg 1 month ago
parent
commit
cc568ab80a
13 changed files with 368 additions and 18 deletions
  1. +35
    -10
      index.js
  2. +19
    -0
      lib/lcsc/get-csrf-token.js
  3. +37
    -0
      lib/lcsc/manufacturer-map.js
  4. +60
    -0
      lib/lcsc/task/find-categories.js
  5. +43
    -0
      lib/lcsc/task/normalize-product.js
  6. +73
    -0
      lib/lcsc/task/scrape-category.js
  7. +33
    -0
      lib/lcsc/with-csrf-token.js
  8. +11
    -0
      lib/shared/match-or-fail.js
  9. +15
    -0
      lib/shared/normalize-string.js
  10. +14
    -0
      lib/shared/pick-best-option.js
  11. +16
    -8
      lib/st/task/normalize-product.js
  12. +2
    -0
      package.json
  13. +10
    -0
      yarn.lock

+ 35
- 10
index.js View File

@ -6,14 +6,17 @@ const stScrapeCategory = require("./lib/st/task/scrape-category");
const stFindCategories = require("./lib/st/task/find-categories");
const stScrapeProduct = require("./lib/st/task/scrape-product");
const stNormalizeProduct = require("./lib/st/task/normalize-product");
const lcscFindCategories = require("./lib/lcsc/task/find-categories");
const lcscScrapeCategory = require("./lib/lcsc/task/scrape-category");
const lcscNormalizeProduct = require("./lib/lcsc/task/normalize-product");
let session = bhttp.session({
headers: {
"user-agent": "seekseek.org beta crawler (contact/problems: admin@cryto.net)"
}
});
let state = { session };
let state = {
session: bhttp.session({
headers: {
"user-agent": "seekseek.org beta crawler (contact/problems: admin@cryto.net)"
}
})
};
module.exports = {
database: {
@ -24,11 +27,18 @@ module.exports = {
id: "st:home",
tags: [ "st:home" ],
data: {}
}, {
id: "lcsc:home",
tags: [ "lcsc:home" ],
data: {}
}],
tags: {
"st:home": [ "st:findCategories" ],
"st:category": [ "st:scrapeCategory" ],
"st:product": [ "st:scrapeProduct", "st:normalizeProduct" ]
"st:product": [ "st:scrapeProduct", "st:normalizeProduct" ],
"lcsc:home": [ "lcsc:findCategories" ],
"lcsc:category": [ "lcsc:scrapeCategory" ],
"lcsc:product": [ "lcsc:normalizeProduct" ]
},
tasks: {
"st:findCategories": {
@ -48,8 +58,23 @@ module.exports = {
},
"st:normalizeProduct": {
dependsOn: [ "st:scrapeProduct" ],
version: "3",
version: "4",
run: stNormalizeProduct(state)
}
},
"lcsc:findCategories": {
ttl: "30d",
taskVersion: "1",
run: lcscFindCategories(state)
},
"lcsc:scrapeCategory": {
ttl: "30d",
taskInterval: "1m",
run: lcscScrapeCategory(state)
},
"lcsc:normalizeProduct": {
version: "3",
parallelTasks: Infinity,
run: lcscNormalizeProduct(state)
},
}
};

+ 19
- 0
lib/lcsc/get-csrf-token.js View File

@ -0,0 +1,19 @@
"use strict";
const Promise = require("bluebird");
const assureResponse = require("../shared/assure-response");
const matchOrFail = require("../shared/match-or-fail");
module.exports = function ({ session }) {
return function getCSRFToken() {
return Promise.try(() => {
return session.get("https://lcsc.com/products");
}).then((response) => {
assureResponse(response);
let [ extractedToken ] = matchOrFail(/'X-CSRF-TOKEN': '([^']+)'/, response.body);
return extractedToken;
});
};
};

+ 37
- 0
lib/lcsc/manufacturer-map.js View File

@ -0,0 +1,37 @@
"use strict";
// FIXME: Now that the manufacturer name is used in the item key, need to figure out a way to change those keys when it changes, eg. through a script
// Maps manufacturer names as reported by LCSC (keys) to the names stored on normalized datasheet entries (values).
// Key casing does not matter: lib/lcsc/task/normalize-product.js lowercases every key of this table, and lowercases the scraped manufacturer name before looking it up.
// A scraped name that has no entry here is used as-is by that task.
module.exports = {
"realtek semicon": "Realtek",
"ruilon(shenzhen ruilongyuan elec)": "RUILON",
"changzhou huawei elec": "Changzhou Huawei",
"chengdu ashining tech": "Ashining",
"htc korea taejin tech": "HTC Korea / TAEJIN",
"utc(unisonic tech)": "Unisonic",
"umw(youtai semiconductor co., ltd.)": "UMW / Youtai",
"microchip tech": "Microchip",
"hrs(hirose)": "Hirose",
"mornsun guangzhou s& t": "Mornsun",
"bothhand enterprise": "Bothhand",
"shenzhen sunyuan tech": "Sunyuan",
"txc corp": "TXC",
// From https://git.cryto.net/seekseek/scrape-documentation/issues/1
"2Pai Semi": "2Pai Semi",
"3L COILS": "3L",
"3M": "3M",
"3PEAK": "3PEAK",
"(7Q-TEK)": "7Q-Tek",
"99IOT": "99 IoT",
"Aavid Thermalloy": "Aavid",
"ABLIC": "ABLIC / Seiko",
"ABOV Semicon": "ABOV",
"Abracon LLC": "Abracon",
"Acam Messelectronic Gmbh": "Acam",
"ACX": "ACX / CoorsTek",
"ADDtek Corp": "ADDtek",
"Adesto Technologies": "Adesto / Dialog",
"AD Semicon": "AD Semicon",
"Advanced Monolithic Systems": "AMS / Advanced Monolithic Systems",
};

+ 60
- 0
lib/lcsc/task/find-categories.js View File

@ -0,0 +1,60 @@
"use strict";
const assert = require("assert");
const assureResponse = require("../../shared/assure-response");
module.exports = function lcscFindCategories(state) {
const withCsrfToken = require("../with-csrf-token")(state);
let { session } = state;
return async function ({ createItem }) {
let response = await withCsrfToken((token) => {
return session.post("https://lcsc.com/products/categories", {
manufacturer: "",
in_stock: "false",
is_RoHS: "false"
}, {
headers: {
"accept": "application/json, text/javascript, */*; q=0.01",
"X-CSRF-TOKEN": token
}
});
});
assureResponse(response);
assert(response.body.data.data != null);
assert(response.body.code === 200);
function processCategoryEntries(categories) {
for (let category of categories) {
createItem({
id: `lcsc:category:${category.id}`,
tags: [ "lcsc:category" ],
data: {
... category,
pageNumber: 1
}
});
if (category.subs != null) {
processCategoryEntries(category.subs);
}
}
}
// Listing is a {name: data} mapping of categories
processCategoryEntries(Object.values(response.body.data.data));
// for (let category of Object.values(response.body.data.data)) {
// createItem({
// id: `lcsc:category:${category.id}`,
// tags: [ "lcsc:category" ],
// data: {
// ... category,
// pageNumber: 1
// }
// });
// }
};
};

+ 43
- 0
lib/lcsc/task/normalize-product.js View File

@ -0,0 +1,43 @@
"use strict";
const matchValue = require("match-value");
const mapObj = require("map-obj");
const normalizeString = require("../../shared/normalize-string");
const manufacturerMap = require("../manufacturer-map");
const pickBestOption = require("../../shared/pick-best-option");
let normalizedManufacturerMap = mapObj(manufacturerMap, (key, value) => {
return [ key.toLowerCase(), value ];
});
module.exports = function lcscNormalizeProduct() {
return async function ({ data, createItem }) {
let url = normalizeString(data.datasheet.pdf);
let manufacturer = normalizeString(data.manufacturer.en);
let model = normalizeString(data.info.number);
let productID = normalizeString(data.number);
let description = normalizeString(data.description);
let mappedManufacturer = (manufacturer != null)
? matchValue(manufacturer.toLowerCase(), {
... normalizedManufacturerMap,
_: manufacturer
})
: null;
if (url != null && model != null) {
createItem({
id: `datasheet:${manufacturer}:${model}`,
update: (data) => pickBestOption(data, {
priority: 0.4,
manufacturer: mappedManufacturer,
productID: productID,
name: model,
description: description,
url: url
})
});
}
};
};

+ 73
- 0
lib/lcsc/task/scrape-category.js View File

@ -0,0 +1,73 @@
"use strict";
const assert = require("assert");
const assureResponse = require("../../shared/assure-response");
// TODO: Validate response formats with validatem instead
module.exports = function lcscScrapeCategory(state) {
const withCSRFToken = require("../with-csrf-token")(state);
let { session } = state;
return async function ({ data, createItem, deleteItem, updateData }) {
let response = await withCSRFToken((token) => {
return session.post(`https://lcsc.com/api/products/search`, {
current_page: String(data.pageNumber),
category: String(data.id),
in_stock: "false",
is_RoHS: "false",
show_icon: "false",
search_content: "",
limit: "10000"
}, {
headers: {
"accept": "application/json, text/javascript, */*; q=0.01",
"X-CSRF-TOKEN": token
}
});
});
assureResponse(response);
assert(response.body.code === 200);
assert(response.body.result.data != null);
for (let item of response.body.result.data) {
createItem({
// NOTE: item.id seems like the database ID on the website, but item.number is the actual LCSC part number used internally for inventory management, so we use that for identification instead
id: `lcsc:product:${item.number}`,
tags: [ "lcsc:product" ],
data: item
});
}
if (data.pageNumber === 1) {
let totalPageCount = response.body.result.total_page;
assert(totalPageCount != null);
updateData((data) => ({
... data,
pageCount: totalPageCount
}));
// for (let i = 2; i <= totalPageCount; i++) {
// createItem({
// id: `lcsc:category:${data.id}:page-${i}`,
// tags: [ "lcsc:category" ],
// data: {
// id: data.id,
// pageNumber: i
// }
// });
// }
// FIXME: Figure out a workaround for the 10k-items-per-category cap
if (totalPageCount > 1) {
console.warn(`WARNING (LCSC): More than one page for category ${data.id}, but cannot paginate!`);
}
} else {
// We don't keep around items representing pages beyond the first, after indexing them, because total page count can change and the page numbers are not stable identifiers. We can just recreate them on the next scrape of the first page (which always exists).
deleteItem();
}
};
};

+ 33
- 0
lib/lcsc/with-csrf-token.js View File

@ -0,0 +1,33 @@
"use strict";
// FIXME: This is a bit of a hack to persist the CSRF token across calls. There is probably a better solution for this, but that sort of state management needs to be handled on a scraping-server level, probably.
let token;
module.exports = function (state) {
const getCsrfToken = require("./get-csrf-token")(state);
return async function withCSRFToken(callback) {
async function obtainToken() {
token = await getCsrfToken();
}
async function attemptCallback() {
let response = await callback(token);
if (response.statusCode === 419) {
await obtainToken();
return attemptCallback();
} else {
return response;
}
}
if (token == null) {
await obtainToken();
}
return attemptCallback();
};
};

+ 11
- 0
lib/shared/match-or-fail.js View File

@ -0,0 +1,11 @@
"use strict";
module.exports = function matchOrFail(regex, string) {
let match = regex.exec(string);
if (match != null) {
return match.slice(1);
} else {
throw new Error(`Failed to match regex ${regex}`);
}
};

+ 15
- 0
lib/shared/normalize-string.js View File

@ -0,0 +1,15 @@
"use strict";
module.exports = function normalizeString(string) {
if (string == null) {
return null;
} else {
let trimmed = string.trim();
if (trimmed.length === 0) {
return null;
} else {
return trimmed;
}
}
};

+ 14
- 0
lib/shared/pick-best-option.js View File

@ -0,0 +1,14 @@
"use strict";
module.exports = function pickBestOption(a, b) {
if (a == null || a.priority == null) {
return b;
} else if (b = null || b.priority == null) {
return a;
} else if (b.priority > a.priority) {
return b;
} else {
// NOTE: We return the first (ie. original) one in the case of a 'tie'
return a;
}
};

+ 16
- 8
lib/st/task/normalize-product.js View File

@ -1,18 +1,26 @@
"use strict";
const htmlEntities = require("html-entities");
const pickBestOption = require("../../shared/pick-best-option");
module.exports = function normalizeProduct() {
return async function ({ data, createItem }) {
let manufacturer = "STMicroelectronics";
let modelName = data.cellData["XJE010_VT-007"];
let description = htmlEntities.decode(data.cellData["XJE014_VT-007"]);
let url = data.datasheetLink;
let productID = data.productId;
createItem({
id: `datasheet:st:${data.productId}`,
data: {
manufacturer: "STMicroelectronics",
productID: data.productId,
name: data.cellData["XJE010_VT-007"],
description: htmlEntities.decode(data.cellData["XJE014_VT-007"]),
url: data.datasheetLink
}
id: `datasheet:${manufacturer}:${modelName}`,
update: (data) => pickBestOption(data, {
priority: 0.8,
manufacturer: manufacturer,
productID: productID,
name: modelName,
description: description,
url: url
})
});
};
};

+ 2
- 0
package.json View File

@ -10,6 +10,8 @@
"bluebird": "^3.7.2",
"cheerio": "^1.0.0-rc.5",
"html-entities": "^2.1.1",
"map-obj": "^4.2.0",
"match-value": "^1.1.0",
"syncpipe": "^1.0.0"
},
"devDependencies": {


+ 10
- 0
yarn.lock View File

@ -740,6 +740,16 @@ lru-cache@^6.0.0:
dependencies:
yallist "^4.0.0"
map-obj@^4.2.0:
version "4.2.0"
resolved "https://registry.yarnpkg.com/map-obj/-/map-obj-4.2.0.tgz#0e8bc823e2aaca8a0942567d12ed14f389eec153"
integrity sha512-NAq0fCmZYGz9UFEQyndp7sisrow4GroyGeKluyKC/chuITZsPyOyC1UJZPJlVFImhXdROIP5xqouRLThT3BbpQ==
match-value@^1.1.0:
version "1.1.0"
resolved "https://registry.yarnpkg.com/match-value/-/match-value-1.1.0.tgz#ad311ef8bbe2d344a53ec3104e28fe221984b98e"
integrity sha512-NOvpobcmkX+l9Eb6r2s3BkR1g1ZwzExDFdXA9d6p1r1O1olLbo88KuzMiBmg43xSpodfm7I6Hqlx2OoySquEgg==
mime@^1.3.4:
version "1.6.0"
resolved "https://registry.yarnpkg.com/mime/-/mime-1.6.0.tgz#32cd9e5c64553bd58d19a568af452acff04981b1"


Loading…
Cancel
Save