Browse Source

Initial commit

master
Sven Slootweg 7 months ago
commit
6c66d7f070
  1. 3
      .eslintrc
  2. 1
      .gitignore
  3. 55
      index.js
  4. 8
      lib/shared/assure-response.js
  5. 12
      lib/shared/get-untagged-text.js
  6. 12
      lib/st/extract-id.js
  7. 28
      lib/st/task/find-categories.js
  8. 18
      lib/st/task/normalize-product.js
  9. 60
      lib/st/task/scrape-category.js
  10. 81
      lib/st/task/scrape-product.js
  11. 27
      notes.txt
  12. 19
      package.json
  13. 1110
      yarn.lock

3
.eslintrc

@ -0,0 +1,3 @@
{
"extends": "@joepie91/eslint-config"
}

1
.gitignore

@ -0,0 +1 @@
node_modules

55
index.js

@ -0,0 +1,55 @@
"use strict";
const bhttp = require("bhttp");
const stScrapeCategory = require("./lib/st/task/scrape-category");
const stFindCategories = require("./lib/st/task/find-categories");
const stScrapeProduct = require("./lib/st/task/scrape-product");
const stNormalizeProduct = require("./lib/st/task/normalize-product");
let session = bhttp.session({
headers: {
"user-agent": "seekseek.org beta crawler (contact/problems: admin@cryto.net)"
}
});
let state = { session };
module.exports = {
database: {
host: "/run/postgresql",
database: "seekseek_documentation"
},
seed: [{
id: "st:home",
tags: [ "st:home" ],
data: {}
}],
tags: {
"st:home": [ "st:findCategories" ],
"st:category": [ "st:scrapeCategory" ],
"st:product": [ "st:scrapeProduct", "st:normalizeProduct" ]
},
tasks: {
"st:findCategories": {
ttl: "15d",
run: stFindCategories(state)
},
"st:scrapeCategory": {
ttl: "1d",
taskInterval: "60s",
version: "2",
run: stScrapeCategory(state)
},
"st:scrapeProduct": {
ttl: "15d",
taskInterval: "5s",
run: stScrapeProduct(state)
},
"st:normalizeProduct": {
dependsOn: [ "st:scrapeProduct" ],
version: "3",
run: stNormalizeProduct(state)
}
}
};

8
lib/shared/assure-response.js

@ -0,0 +1,8 @@
"use strict";
module.exports = function assureResponse(response) {
// FIXME: Add permittedStatusCodes to bhttp
if (response.statusCode !== 200) {
throw new Error(`Got non-200 status code: ${response.statusCode}`);
}
};

12
lib/shared/get-untagged-text.js

@ -0,0 +1,12 @@
/* eslint-disable indent */
"use strict";
module.exports = function getUntaggedText($element) {
return $element
.clone()
.children()
.remove()
.end()
.text()
.trim();
};

12
lib/st/extract-id.js

@ -0,0 +1,12 @@
"use strict";
module.exports = function extractID(string) {
// Quick-and-dirty category ID parsing from category pages
let match = /"prmisID":"([^"]+)"/.exec(string);
if (match != null) {
return match[1];
} else {
throw new Error(`ST: prmis ID expected but not found`);
}
};

28
lib/st/task/find-categories.js

@ -0,0 +1,28 @@
"use strict";
const cheerio = require("cheerio");
const url = require("url");
const assureResponse = require("../../shared/assure-response");
module.exports = function findCategories({ session }) {
return async function ({ createItem }) {
let response = await session.get("https://www.st.com/content/st_com/en.html");
assureResponse(response);
let $ = cheerio.load(response.body);
let links = $("#Top_Menu_Products :is(a.st-nav__blockmenu-title, a.st-nav__blockmenu-link)")
.toArray()
.map((element) => $(element).attr("href"))
.map((relativeURL) => url.resolve("https://www.st.com/", relativeURL));
for (let link of links) {
createItem({
id: `st:category:${link}`,
tags: [ "st:category" ],
data: { url: link }
});
}
};
};

18
lib/st/task/normalize-product.js

@ -0,0 +1,18 @@
"use strict";
const htmlEntities = require("html-entities");
module.exports = function normalizeProduct() {
return async function ({ data, createItem }) {
createItem({
id: `datasheet:st:${data.productId}`,
data: {
manufacturer: "STMicroelectronics",
productID: data.productId,
name: data.cellData["XJE010_VT-007"],
description: htmlEntities.decode(data.cellData["XJE014_VT-007"]),
url: data.datasheetLink
}
});
};
};

60
lib/st/task/scrape-category.js

@ -0,0 +1,60 @@
"use strict";
const assert = require("assert");
const syncpipe = require("syncpipe");
const assureResponse = require("../../shared/assure-response");
const extractId = require("../extract-id");
module.exports = function scrapeCategory({ session }) {
return async function({ data, createItem }) {
let response = await session.get(data.url);
assureResponse(response);
let prmisID = extractId(response.body.toString());
let listingResponse = await session.get(`https://www.st.com/en/documentation/scraper.cxst-ps-grid.html/${encodeURIComponent(prmisID)}.json`, { noDecode: true });
assureResponse(response);
let listingBuffer = listingResponse.body;
if (listingBuffer.length > 0) {
// This is a category that has a product explorer
let listing = JSON.parse(listingBuffer.toString());
let cellNames = listing.columns.map((column) => {
let cellName = (column.identifier != null)
? `${column.identifier}_${column.qualifier_identifier}`
: `nonstandard:${column.name}:${column.qualifier}`
createItem({
id: `st:column:${cellName}`,
tags: [ "st:column" ],
data: column
});
return cellName;
});
for (let row of listing.rows) {
assert(row.productId != null);
let cellData = syncpipe(row.cells, [
(_) => _.map((cell, i) => [ cellNames[i], cell.value ]),
(_) => Object.fromEntries(_)
]);
createItem({
id: `st:product:${row.productId}`,
tags: [ "st:product" ],
data: {
... row,
cells: undefined,
cellData: cellData
}
});
}
} else {
console.warn("Warning: empty response, category does not have product explorer");
}
};
};

81
lib/st/task/scrape-product.js

@ -0,0 +1,81 @@
"use strict";
const cheerio = require("cheerio");
const url = require("url");
const assureResponse = require("../../shared/assure-response");
const getUntaggedText = require("../../shared/get-untagged-text");
module.exports = function scrapeProduct({ session }) {
return async function({ data, createItem, updateData, expireDependents }) {
if (data.productFolderUrl == null) {
throw new Error(`No known product page URL`);
}
let productPageURL = url.resolve("https://www.st.com/", data.productFolderUrl);
let response = await session.get(productPageURL);
assureResponse(response);
let $ = cheerio.load(response.body);
let datasheetLink = $("a[data-js='datasheetLink']").attr("href");
let datasheetURL = (datasheetLink != null)
? url.resolve(productPageURL, datasheetLink)
: null;
let resources = $(".st-table--resources")
.find("h3").toArray()
.map((heading) => {
let $heading = $(heading);
let sectionID = $heading.attr("id");
let sectionTitle = $heading.text().trim();
let $table = $heading.next("table");
let items = $table
.find("tbody tr").toArray()
.map((row) => {
let $row = $(row);
let $mainView = $row.find(".visible-on-desktop-only");
let $link = $mainView.find("a.st-link");
return {
url: url.resolve(productPageURL, $link.attr("href")),
documentID: $link.find("span.st-font--bold").text().trim(),
description: $link.find("span:not(.st-font--bold)").text().trim(),
version: getUntaggedText($link),
date: $row.find(".visible-on-desktop-only[data-latest-update]").text().trim()
};
});
return {
sectionID: sectionID,
sectionTitle: sectionTitle,
items: items
};
});
updateData((data) => {
return {
... data,
datasheetLink: datasheetURL,
resources: resources
};
});
expireDependents();
for (let section of resources) {
for (let resource of section.items) {
createItem({
id: `st:resource:${resource.url}`,
tags: (resource.url === datasheetURL)
? [ "st:resource", "st:datasheet" ]
: [ "st_resource" ],
data: { url: resource.url }
});
}
}
};
};

27
notes.txt

@ -0,0 +1,27 @@
// Lookup table from ST field codes to readable field names.
// NOTE(review): the codes presumably correspond to the `identifier` part of
// the product listing column names — confirm before relying on this.
// Two codes (STP034 and STP273) map to the same name, "channelCount".
let stFieldMapping = {
	"XJE010": "partNumber",
	"XJE014": "description",
	"STP716": "marketingStatus",
	"XJE017": "package",
	"STP1187": "grade",
	"STP034": "channelCount",
	"STP273": "channelCount",
	"XJE219": "operatingTemperature",
	"STP00930": "supplyCurrentPerChannel",
	"XJE417": "supplyVoltage",
	"XJG099": "inputOffsetVoltage",
	"XJG102": "inputBiasCurrent",
	"XJG110": "gainBandwidthProduct",
	"XJG108": "slewRate",
	"XJF556": "outputCurrent",
	"STP1041": "railToRailInput",
	"STP1042": "railToRailOutput",
	"XJG111": "inputEquivalentNoiseVoltage",
	"STP0878": "unitPriceUSD",
	"XJH198": "recommendedOutputCurrent",
	"STP915": "adjustableCurrentLimit",
	"XJG640": "rdsOn",
	"STP914": "enablePinActiveLevel",
	"RNP218": "esdHBMMinimum",
	"XJE418": "supplyCurrent"
};

19
package.json

@ -0,0 +1,19 @@
{
"name": "scrape-documentation",
"version": "1.0.0",
"main": "index.js",
"repository": "git@git.cryto.net:seekseek/scrape-documentation.git",
"author": "Sven Slootweg <admin@cryto.net>",
"license": "MIT",
"dependencies": {
"bhttp": "^1.2.8",
"bluebird": "^3.7.2",
"cheerio": "^1.0.0-rc.5",
"html-entities": "^2.1.1",
"syncpipe": "^1.0.0"
},
"devDependencies": {
"@joepie91/eslint-config": "^1.1.0",
"eslint": "^7.22.0"
}
}

1110
yarn.lock
File diff suppressed because it is too large
View File

Loading…
Cancel
Save