Add LCSC scraper, add priority logic for datasheet entry normalization
parent
6c66d7f070
commit
cc568ab80a
@ -0,0 +1,19 @@
|
||||
"use strict";
|
||||
|
||||
const Promise = require("bluebird");
|
||||
|
||||
const assureResponse = require("../shared/assure-response");
|
||||
const matchOrFail = require("../shared/match-or-fail");
|
||||
|
||||
module.exports = function ({ session }) {
|
||||
return function getCSRFToken() {
|
||||
return Promise.try(() => {
|
||||
return session.get("https://lcsc.com/products");
|
||||
}).then((response) => {
|
||||
assureResponse(response);
|
||||
|
||||
let [ extractedToken ] = matchOrFail(/'X-CSRF-TOKEN': '([^']+)'/, response.body);
|
||||
return extractedToken;
|
||||
});
|
||||
};
|
||||
};
|
@ -0,0 +1,60 @@
|
||||
"use strict";
|
||||
|
||||
const assert = require("assert");
|
||||
|
||||
const assureResponse = require("../../shared/assure-response");
|
||||
|
||||
module.exports = function lcscFindCategories(state) {
|
||||
const withCsrfToken = require("../with-csrf-token")(state);
|
||||
|
||||
let { session } = state;
|
||||
|
||||
return async function ({ createItem }) {
|
||||
let response = await withCsrfToken((token) => {
|
||||
return session.post("https://lcsc.com/products/categories", {
|
||||
manufacturer: "",
|
||||
in_stock: "false",
|
||||
is_RoHS: "false"
|
||||
}, {
|
||||
headers: {
|
||||
"accept": "application/json, text/javascript, */*; q=0.01",
|
||||
"X-CSRF-TOKEN": token
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
assureResponse(response);
|
||||
assert(response.body.data.data != null);
|
||||
assert(response.body.code === 200);
|
||||
|
||||
function processCategoryEntries(categories) {
|
||||
for (let category of categories) {
|
||||
createItem({
|
||||
id: `lcsc:category:${category.id}`,
|
||||
tags: [ "lcsc:category" ],
|
||||
data: {
|
||||
... category,
|
||||
pageNumber: 1
|
||||
}
|
||||
});
|
||||
|
||||
if (category.subs != null) {
|
||||
processCategoryEntries(category.subs);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Listing is a {name: data} mapping of categories
|
||||
processCategoryEntries(Object.values(response.body.data.data));
|
||||
// for (let category of Object.values(response.body.data.data)) {
|
||||
// createItem({
|
||||
// id: `lcsc:category:${category.id}`,
|
||||
// tags: [ "lcsc:category" ],
|
||||
// data: {
|
||||
// ... category,
|
||||
// pageNumber: 1
|
||||
// }
|
||||
// });
|
||||
// }
|
||||
};
|
||||
};
|
@ -0,0 +1,43 @@
|
||||
"use strict";
|
||||
|
||||
const matchValue = require("match-value");
|
||||
const mapObj = require("map-obj");
|
||||
|
||||
const normalizeString = require("../../shared/normalize-string");
|
||||
const manufacturerMap = require("../manufacturer-map");
|
||||
const pickBestOption = require("../../shared/pick-best-option");
|
||||
|
||||
// Copy of the manufacturer map with every key lowercased, so that lookups can
// be done on a lowercased manufacturer name; values are left untouched.
let normalizedManufacturerMap = mapObj(manufacturerMap, (rawName, canonicalName) => [ rawName.toLowerCase(), canonicalName ]);
|
||||
|
||||
module.exports = function lcscNormalizeProduct() {
|
||||
return async function ({ data, createItem }) {
|
||||
let url = normalizeString(data.datasheet.pdf);
|
||||
let manufacturer = normalizeString(data.manufacturer.en);
|
||||
let model = normalizeString(data.info.number);
|
||||
let productID = normalizeString(data.number);
|
||||
let description = normalizeString(data.description);
|
||||
|
||||
let mappedManufacturer = (manufacturer != null)
|
||||
? matchValue(manufacturer.toLowerCase(), {
|
||||
... normalizedManufacturerMap,
|
||||
_: manufacturer
|
||||
})
|
||||
: null;
|
||||
|
||||
if (url != null && model != null) {
|
||||
createItem({
|
||||
id: `datasheet:${manufacturer}:${model}`,
|
||||
update: (data) => pickBestOption(data, {
|
||||
priority: 0.4,
|
||||
manufacturer: mappedManufacturer,
|
||||
productID: productID,
|
||||
name: model,
|
||||
description: description,
|
||||
url: url
|
||||
})
|
||||
});
|
||||
}
|
||||
};
|
||||
};
|
@ -0,0 +1,73 @@
|
||||
"use strict";
|
||||
|
||||
const assert = require("assert");
|
||||
|
||||
const assureResponse = require("../../shared/assure-response");
|
||||
|
||||
// TODO: Validate response formats with validatem instead
|
||||
|
||||
module.exports = function lcscScrapeCategory(state) {
|
||||
const withCSRFToken = require("../with-csrf-token")(state);
|
||||
|
||||
let { session } = state;
|
||||
|
||||
return async function ({ data, createItem, deleteItem, updateData }) {
|
||||
let response = await withCSRFToken((token) => {
|
||||
return session.post(`https://lcsc.com/api/products/search`, {
|
||||
current_page: String(data.pageNumber),
|
||||
category: String(data.id),
|
||||
in_stock: "false",
|
||||
is_RoHS: "false",
|
||||
show_icon: "false",
|
||||
search_content: "",
|
||||
limit: "10000"
|
||||
}, {
|
||||
headers: {
|
||||
"accept": "application/json, text/javascript, */*; q=0.01",
|
||||
"X-CSRF-TOKEN": token
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
assureResponse(response);
|
||||
assert(response.body.code === 200);
|
||||
assert(response.body.result.data != null);
|
||||
|
||||
for (let item of response.body.result.data) {
|
||||
createItem({
|
||||
// NOTE: item.id seems like the database ID on the website, but item.number is the actual LCSC part number used internally for inventory management, so we use that for identification instead
|
||||
id: `lcsc:product:${item.number}`,
|
||||
tags: [ "lcsc:product" ],
|
||||
data: item
|
||||
});
|
||||
}
|
||||
|
||||
if (data.pageNumber === 1) {
|
||||
let totalPageCount = response.body.result.total_page;
|
||||
assert(totalPageCount != null);
|
||||
|
||||
updateData((data) => ({
|
||||
... data,
|
||||
pageCount: totalPageCount
|
||||
}));
|
||||
|
||||
// for (let i = 2; i <= totalPageCount; i++) {
|
||||
// createItem({
|
||||
// id: `lcsc:category:${data.id}:page-${i}`,
|
||||
// tags: [ "lcsc:category" ],
|
||||
// data: {
|
||||
// id: data.id,
|
||||
// pageNumber: i
|
||||
// }
|
||||
// });
|
||||
// }
|
||||
// FIXME: Figure out a workaround for the 10k-items-per-category cap
|
||||
if (totalPageCount > 1) {
|
||||
console.warn(`WARNING (LCSC): More than one page for category ${data.id}, but cannot paginate!`);
|
||||
}
|
||||
} else {
|
||||
// We don't keep around items representing pages beyond the first, after indexing them, because total page count can change and the page numbers are not stable identifiers. We can just recreate them on the next scrape of the first page (which always exists).
|
||||
deleteItem();
|
||||
}
|
||||
};
|
||||
};
|
@ -0,0 +1,33 @@
|
||||
"use strict";
|
||||
|
||||
|
||||
// FIXME: This is a bit of a hack to persist the CSRF token across calls. There is probably a better solution for this, but that sort of state management needs to be handled on a scraping-server level, probably.
|
||||
let token;
|
||||
|
||||
module.exports = function (state) {
|
||||
const getCsrfToken = require("./get-csrf-token")(state);
|
||||
|
||||
return async function withCSRFToken(callback) {
|
||||
async function obtainToken() {
|
||||
token = await getCsrfToken();
|
||||
}
|
||||
|
||||
async function attemptCallback() {
|
||||
let response = await callback(token);
|
||||
|
||||
if (response.statusCode === 419) {
|
||||
await obtainToken();
|
||||
|
||||
return attemptCallback();
|
||||
} else {
|
||||
return response;
|
||||
}
|
||||
}
|
||||
|
||||
if (token == null) {
|
||||
await obtainToken();
|
||||
}
|
||||
|
||||
return attemptCallback();
|
||||
};
|
||||
};
|
@ -0,0 +1,11 @@
|
||||
"use strict";
|
||||
|
||||
module.exports = function matchOrFail(regex, string) {
|
||||
let match = regex.exec(string);
|
||||
|
||||
if (match != null) {
|
||||
return match.slice(1);
|
||||
} else {
|
||||
throw new Error(`Failed to match regex ${regex}`);
|
||||
}
|
||||
};
|
@ -0,0 +1,15 @@
|
||||
"use strict";
|
||||
|
||||
module.exports = function normalizeString(string) {
|
||||
if (string == null) {
|
||||
return null;
|
||||
} else {
|
||||
let trimmed = string.trim();
|
||||
|
||||
if (trimmed.length === 0) {
|
||||
return null;
|
||||
} else {
|
||||
return trimmed;
|
||||
}
|
||||
}
|
||||
};
|
@ -0,0 +1,14 @@
|
||||
"use strict";
|
||||
|
||||
module.exports = function pickBestOption(a, b) {
|
||||
if (a == null || a.priority == null) {
|
||||
return b;
|
||||
} else if (b = null || b.priority == null) {
|
||||
return a;
|
||||
} else if (b.priority > a.priority) {
|
||||
return b;
|
||||
} else {
|
||||
// NOTE: We return the first (ie. original) one in the case of a 'tie'
|
||||
return a;
|
||||
}
|
||||
};
|
@ -1,18 +1,26 @@
|
||||
"use strict";
|
||||
|
||||
const htmlEntities = require("html-entities");
|
||||
const pickBestOption = require("../../shared/pick-best-option");
|
||||
|
||||
module.exports = function normalizeProduct() {
|
||||
return async function ({ data, createItem }) {
|
||||
let manufacturer = "STMicroelectronics";
|
||||
let modelName = data.cellData["XJE010_VT-007"];
|
||||
let description = htmlEntities.decode(data.cellData["XJE014_VT-007"]);
|
||||
let url = data.datasheetLink;
|
||||
let productID = data.productId;
|
||||
|
||||
createItem({
|
||||
id: `datasheet:st:${data.productId}`,
|
||||
data: {
|
||||
manufacturer: "STMicroelectronics",
|
||||
productID: data.productId,
|
||||
name: data.cellData["XJE010_VT-007"],
|
||||
description: htmlEntities.decode(data.cellData["XJE014_VT-007"]),
|
||||
url: data.datasheetLink
|
||||
}
|
||||
id: `datasheet:${manufacturer}:${modelName}`,
|
||||
update: (data) => pickBestOption(data, {
|
||||
priority: 0.8,
|
||||
manufacturer: manufacturer,
|
||||
productID: productID,
|
||||
name: modelName,
|
||||
description: description,
|
||||
url: url
|
||||
})
|
||||
});
|
||||
};
|
||||
};
|
||||
|
Loading…
Reference in New Issue