Initial commit
commit
6c66d7f070
@ -0,0 +1 @@
|
||||
node_modules
|
@ -0,0 +1,55 @@
|
||||
"use strict";
|
||||
|
||||
const bhttp = require("bhttp");
|
||||
|
||||
const stScrapeCategory = require("./lib/st/task/scrape-category");
|
||||
const stFindCategories = require("./lib/st/task/find-categories");
|
||||
const stScrapeProduct = require("./lib/st/task/scrape-product");
|
||||
const stNormalizeProduct = require("./lib/st/task/normalize-product");
|
||||
|
||||
// HTTP session used by the scraping tasks. It is created with a user-agent
// header that identifies the crawler and gives a contact address.
let session = bhttp.session({
	headers: {
		"user-agent": "seekseek.org beta crawler (contact/problems: admin@cryto.net)"
	}
});

// State object passed to every task factory in the configuration below.
let state = { session };
|
||||
|
||||
module.exports = {
|
||||
database: {
|
||||
host: "/run/postgresql",
|
||||
database: "seekseek_documentation"
|
||||
},
|
||||
seed: [{
|
||||
id: "st:home",
|
||||
tags: [ "st:home" ],
|
||||
data: {}
|
||||
}],
|
||||
tags: {
|
||||
"st:home": [ "st:findCategories" ],
|
||||
"st:category": [ "st:scrapeCategory" ],
|
||||
"st:product": [ "st:scrapeProduct", "st:normalizeProduct" ]
|
||||
},
|
||||
tasks: {
|
||||
"st:findCategories": {
|
||||
ttl: "15d",
|
||||
run: stFindCategories(state)
|
||||
},
|
||||
"st:scrapeCategory": {
|
||||
ttl: "1d",
|
||||
taskInterval: "60s",
|
||||
version: "2",
|
||||
run: stScrapeCategory(state)
|
||||
},
|
||||
"st:scrapeProduct": {
|
||||
ttl: "15d",
|
||||
taskInterval: "5s",
|
||||
run: stScrapeProduct(state)
|
||||
},
|
||||
"st:normalizeProduct": {
|
||||
dependsOn: [ "st:scrapeProduct" ],
|
||||
version: "3",
|
||||
run: stNormalizeProduct(state)
|
||||
}
|
||||
}
|
||||
};
|
@ -0,0 +1,8 @@
|
||||
"use strict";
|
||||
|
||||
module.exports = function assureResponse(response) {
|
||||
// FIXME: Add permittedStatusCodes to bhttp
|
||||
if (response.statusCode !== 200) {
|
||||
throw new Error(`Got non-200 status code: ${response.statusCode}`);
|
||||
}
|
||||
};
|
@ -0,0 +1,12 @@
|
||||
/* eslint-disable indent */
|
||||
"use strict";
|
||||
|
||||
module.exports = function getUntaggedText($element) {
|
||||
return $element
|
||||
.clone()
|
||||
.children()
|
||||
.remove()
|
||||
.end()
|
||||
.text()
|
||||
.trim();
|
||||
};
|
@ -0,0 +1,12 @@
|
||||
"use strict";
|
||||
|
||||
module.exports = function extractID(string) {
|
||||
// Quick-and-dirty category ID parsing from category pages
|
||||
let match = /"prmisID":"([^"]+)"/.exec(string);
|
||||
|
||||
if (match != null) {
|
||||
return match[1];
|
||||
} else {
|
||||
throw new Error(`ST: prmis ID expected but not found`);
|
||||
}
|
||||
};
|
@ -0,0 +1,28 @@
|
||||
"use strict";
|
||||
|
||||
const cheerio = require("cheerio");
|
||||
const url = require("url");
|
||||
|
||||
const assureResponse = require("../../shared/assure-response");
|
||||
|
||||
module.exports = function findCategories({ session }) {
|
||||
return async function ({ createItem }) {
|
||||
let response = await session.get("https://www.st.com/content/st_com/en.html");
|
||||
assureResponse(response);
|
||||
|
||||
let $ = cheerio.load(response.body);
|
||||
|
||||
let links = $("#Top_Menu_Products :is(a.st-nav__blockmenu-title, a.st-nav__blockmenu-link)")
|
||||
.toArray()
|
||||
.map((element) => $(element).attr("href"))
|
||||
.map((relativeURL) => url.resolve("https://www.st.com/", relativeURL));
|
||||
|
||||
for (let link of links) {
|
||||
createItem({
|
||||
id: `st:category:${link}`,
|
||||
tags: [ "st:category" ],
|
||||
data: { url: link }
|
||||
});
|
||||
}
|
||||
};
|
||||
};
|
@ -0,0 +1,18 @@
|
||||
"use strict";
|
||||
|
||||
const htmlEntities = require("html-entities");
|
||||
|
||||
module.exports = function normalizeProduct() {
|
||||
return async function ({ data, createItem }) {
|
||||
createItem({
|
||||
id: `datasheet:st:${data.productId}`,
|
||||
data: {
|
||||
manufacturer: "STMicroelectronics",
|
||||
productID: data.productId,
|
||||
name: data.cellData["XJE010_VT-007"],
|
||||
description: htmlEntities.decode(data.cellData["XJE014_VT-007"]),
|
||||
url: data.datasheetLink
|
||||
}
|
||||
});
|
||||
};
|
||||
};
|
@ -0,0 +1,60 @@
|
||||
"use strict";
|
||||
|
||||
const assert = require("assert");
|
||||
const syncpipe = require("syncpipe");
|
||||
|
||||
const assureResponse = require("../../shared/assure-response");
|
||||
const extractId = require("../extract-id");
|
||||
|
||||
module.exports = function scrapeCategory({ session }) {
|
||||
return async function({ data, createItem }) {
|
||||
let response = await session.get(data.url);
|
||||
assureResponse(response);
|
||||
|
||||
let prmisID = extractId(response.body.toString());
|
||||
|
||||
let listingResponse = await session.get(`https://www.st.com/en/documentation/scraper.cxst-ps-grid.html/${encodeURIComponent(prmisID)}.json`, { noDecode: true });
|
||||
assureResponse(response);
|
||||
let listingBuffer = listingResponse.body;
|
||||
|
||||
if (listingBuffer.length > 0) {
|
||||
// This is a category that has a product explorer
|
||||
let listing = JSON.parse(listingBuffer.toString());
|
||||
|
||||
let cellNames = listing.columns.map((column) => {
|
||||
let cellName = (column.identifier != null)
|
||||
? `${column.identifier}_${column.qualifier_identifier}`
|
||||
: `nonstandard:${column.name}:${column.qualifier}`
|
||||
|
||||
createItem({
|
||||
id: `st:column:${cellName}`,
|
||||
tags: [ "st:column" ],
|
||||
data: column
|
||||
});
|
||||
|
||||
return cellName;
|
||||
});
|
||||
|
||||
for (let row of listing.rows) {
|
||||
assert(row.productId != null);
|
||||
|
||||
let cellData = syncpipe(row.cells, [
|
||||
(_) => _.map((cell, i) => [ cellNames[i], cell.value ]),
|
||||
(_) => Object.fromEntries(_)
|
||||
]);
|
||||
|
||||
createItem({
|
||||
id: `st:product:${row.productId}`,
|
||||
tags: [ "st:product" ],
|
||||
data: {
|
||||
... row,
|
||||
cells: undefined,
|
||||
cellData: cellData
|
||||
}
|
||||
});
|
||||
}
|
||||
} else {
|
||||
console.warn("Warning: empty response, category does not have product explorer");
|
||||
}
|
||||
};
|
||||
};
|
@ -0,0 +1,81 @@
|
||||
"use strict";
|
||||
|
||||
const cheerio = require("cheerio");
|
||||
const url = require("url");
|
||||
|
||||
const assureResponse = require("../../shared/assure-response");
|
||||
const getUntaggedText = require("../../shared/get-untagged-text");
|
||||
|
||||
module.exports = function scrapeProduct({ session }) {
|
||||
return async function({ data, createItem, updateData, expireDependents }) {
|
||||
if (data.productFolderUrl == null) {
|
||||
throw new Error(`No known product page URL`);
|
||||
}
|
||||
|
||||
let productPageURL = url.resolve("https://www.st.com/", data.productFolderUrl);
|
||||
|
||||
let response = await session.get(productPageURL);
|
||||
assureResponse(response);
|
||||
|
||||
let $ = cheerio.load(response.body);
|
||||
|
||||
let datasheetLink = $("a[data-js='datasheetLink']").attr("href");
|
||||
let datasheetURL = (datasheetLink != null)
|
||||
? url.resolve(productPageURL, datasheetLink)
|
||||
: null;
|
||||
|
||||
let resources = $(".st-table--resources")
|
||||
.find("h3").toArray()
|
||||
.map((heading) => {
|
||||
let $heading = $(heading);
|
||||
let sectionID = $heading.attr("id");
|
||||
let sectionTitle = $heading.text().trim();
|
||||
|
||||
let $table = $heading.next("table");
|
||||
|
||||
let items = $table
|
||||
.find("tbody tr").toArray()
|
||||
.map((row) => {
|
||||
let $row = $(row);
|
||||
let $mainView = $row.find(".visible-on-desktop-only");
|
||||
let $link = $mainView.find("a.st-link");
|
||||
|
||||
return {
|
||||
url: url.resolve(productPageURL, $link.attr("href")),
|
||||
documentID: $link.find("span.st-font--bold").text().trim(),
|
||||
description: $link.find("span:not(.st-font--bold)").text().trim(),
|
||||
version: getUntaggedText($link),
|
||||
date: $row.find(".visible-on-desktop-only[data-latest-update]").text().trim()
|
||||
};
|
||||
});
|
||||
|
||||
return {
|
||||
sectionID: sectionID,
|
||||
sectionTitle: sectionTitle,
|
||||
items: items
|
||||
};
|
||||
});
|
||||
|
||||
updateData((data) => {
|
||||
return {
|
||||
... data,
|
||||
datasheetLink: datasheetURL,
|
||||
resources: resources
|
||||
};
|
||||
});
|
||||
|
||||
expireDependents();
|
||||
|
||||
for (let section of resources) {
|
||||
for (let resource of section.items) {
|
||||
createItem({
|
||||
id: `st:resource:${resource.url}`,
|
||||
tags: (resource.url === datasheetURL)
|
||||
? [ "st:resource", "st:datasheet" ]
|
||||
: [ "st_resource" ],
|
||||
data: { url: resource.url }
|
||||
});
|
||||
}
|
||||
}
|
||||
};
|
||||
};
|
@ -0,0 +1,27 @@
|
||||
// Maps ST identifier codes to readable camelCase field names.
// NOTE(review): the keys look like the `identifier` part of the
// `<identifier>_<qualifier_identifier>` cell names used elsewhere in this
// project (e.g. "XJE010_VT-007" is used as the product name) - confirm.
// Two codes (STP034 and STP273) intentionally or not map to the same name.
let stFieldMapping = {
	XJE010: "partNumber",
	XJE014: "description",
	STP716: "marketingStatus",
	XJE017: "package",
	STP1187: "grade",
	STP034: "channelCount",
	STP273: "channelCount",
	XJE219: "operatingTemperature",
	STP00930: "supplyCurrentPerChannel",
	XJE417: "supplyVoltage",
	XJG099: "inputOffsetVoltage",
	XJG102: "inputBiasCurrent",
	XJG110: "gainBandwidthProduct",
	XJG108: "slewRate",
	XJF556: "outputCurrent",
	STP1041: "railToRailInput",
	STP1042: "railToRailOutput",
	XJG111: "inputEquivalentNoiseVoltage",
	STP0878: "unitPriceUSD",
	XJH198: "recommendedOutputCurrent",
	STP915: "adjustableCurrentLimit",
	XJG640: "rdsOn",
	STP914: "enablePinActiveLevel",
	RNP218: "esdHBMMinimum",
	XJE418: "supplyCurrent"
};
|
@ -0,0 +1,19 @@
|
||||
{
	"name": "scrape-documentation",
	"version": "1.0.0",
	"main": "index.js",
	"repository": "git@git.cryto.net:seekseek/scrape-documentation.git",
	"author": "Sven Slootweg <admin@cryto.net>",
	"license": "MIT",
	"dependencies": {
		"bhttp": "^1.2.8",
		"bluebird": "^3.7.2",
		"cheerio": "^1.0.0-rc.5",
		"html-entities": "^2.1.1",
		"syncpipe": "^1.0.0"
	},
	"devDependencies": {
		"@joepie91/eslint-config": "^1.1.0",
		"eslint": "^7.22.0"
	}
}
|
Loading…
Reference in New Issue