From d972746f35f89c458a3d5d48746e96995ee0245f Mon Sep 17 00:00:00 2001 From: Sven Slootweg Date: Tue, 10 Jan 2023 19:56:02 +0100 Subject: [PATCH] First pass at refactoring for modularity --- .../farnell/task/normalize-product.js | 0 .../farnell/task/scrape-product.js | 0 .../farnell/task/scrape-sitemap.js | 0 .../mouser/task/scrape-sitemap.js | 2 +- index.js | 161 ++----------- lib/focus-lcds/task/find-categories.js | 30 --- lib/focus-lcds/task/normalize-product.js | 19 -- lib/focus-lcds/task/scrape-category.js | 33 --- lib/focus-lcds/task/scrape-product.js | 45 ---- lib/lcsc/task/find-categories.js | 42 ---- lib/lcsc/task/normalize-product.js | 19 -- lib/lcsc/task/scrape-category.js | 37 --- lib/merge-sources.js | 21 ++ lib/shared/surgeon-utils.js | 2 +- lib/sources/datasheets/focus-lcds.js | 137 ++++++++++++ lib/sources/datasheets/lcsc.js | 108 +++++++++ lib/sources/datasheets/st.js | 211 ++++++++++++++++++ lib/sources/datasheets/tme.js | 202 +++++++++++++++++ lib/st/extract-id.js | 12 - lib/st/task/find-categories.js | 28 --- lib/st/task/normalize-product.js | 20 -- lib/st/task/scrape-category.js | 60 ----- lib/st/task/scrape-product.js | 81 ------- lib/tme/task/normalize-product.js | 47 ---- lib/tme/task/scrape-product.js | 86 ------- lib/tme/task/scrape-sitemap.js | 50 ----- package.json | 3 +- test-import.js | 3 + todo.txt | 7 + yarn.lock | 69 +++++- 30 files changed, 769 insertions(+), 766 deletions(-) rename {lib => incomplete-or-broken}/farnell/task/normalize-product.js (100%) rename {lib => incomplete-or-broken}/farnell/task/scrape-product.js (100%) rename {lib => incomplete-or-broken}/farnell/task/scrape-sitemap.js (100%) rename {lib => incomplete-or-broken}/mouser/task/scrape-sitemap.js (95%) delete mode 100644 lib/focus-lcds/task/find-categories.js delete mode 100644 lib/focus-lcds/task/normalize-product.js delete mode 100644 lib/focus-lcds/task/scrape-category.js delete mode 100644 lib/focus-lcds/task/scrape-product.js delete mode 100644 
lib/lcsc/task/find-categories.js delete mode 100644 lib/lcsc/task/normalize-product.js delete mode 100644 lib/lcsc/task/scrape-category.js create mode 100644 lib/merge-sources.js create mode 100644 lib/sources/datasheets/focus-lcds.js create mode 100644 lib/sources/datasheets/lcsc.js create mode 100644 lib/sources/datasheets/st.js create mode 100644 lib/sources/datasheets/tme.js delete mode 100644 lib/st/extract-id.js delete mode 100644 lib/st/task/find-categories.js delete mode 100644 lib/st/task/normalize-product.js delete mode 100644 lib/st/task/scrape-category.js delete mode 100644 lib/st/task/scrape-product.js delete mode 100644 lib/tme/task/normalize-product.js delete mode 100644 lib/tme/task/scrape-product.js delete mode 100644 lib/tme/task/scrape-sitemap.js create mode 100644 test-import.js create mode 100644 todo.txt diff --git a/lib/farnell/task/normalize-product.js b/incomplete-or-broken/farnell/task/normalize-product.js similarity index 100% rename from lib/farnell/task/normalize-product.js rename to incomplete-or-broken/farnell/task/normalize-product.js diff --git a/lib/farnell/task/scrape-product.js b/incomplete-or-broken/farnell/task/scrape-product.js similarity index 100% rename from lib/farnell/task/scrape-product.js rename to incomplete-or-broken/farnell/task/scrape-product.js diff --git a/lib/farnell/task/scrape-sitemap.js b/incomplete-or-broken/farnell/task/scrape-sitemap.js similarity index 100% rename from lib/farnell/task/scrape-sitemap.js rename to incomplete-or-broken/farnell/task/scrape-sitemap.js diff --git a/lib/mouser/task/scrape-sitemap.js b/incomplete-or-broken/mouser/task/scrape-sitemap.js similarity index 95% rename from lib/mouser/task/scrape-sitemap.js rename to incomplete-or-broken/mouser/task/scrape-sitemap.js index 18fcce8..fe1bd5a 100644 --- a/lib/mouser/task/scrape-sitemap.js +++ b/incomplete-or-broken/mouser/task/scrape-sitemap.js @@ -10,7 +10,7 @@ const simpleSink = require("@promistream/simple-sink"); const parseSitemap = 
require("@promistream/parse-sitemap"); const decodeString = require("@promistream/decode-string"); -const assureResponse = require("../../shared/assure-response"); +const assureResponse = require("../../../lib/shared/assure-response"); let session = bhttp.session({ headers: { diff --git a/index.js b/index.js index 957ad87..fdba56b 100644 --- a/index.js +++ b/index.js @@ -2,6 +2,7 @@ const bhttp = require("bhttp"); const got = require("got"); +const mergeSources = require("./lib/merge-sources"); const assureResponse = require("./lib/shared/assure-response"); @@ -29,7 +30,7 @@ let state = { }) }; -module.exports = { +let baseSchema = { backend: "postgresql", database: { host: "/run/postgresql", @@ -38,151 +39,15 @@ module.exports = { max: 75 } }, - seed: [{ - id: "st:home", - tags: [ "st:home" ], - data: {} - }, { - id: "lcsc:home", - tags: [ "lcsc:home" ], - data: {} - }, { - id: "mouser:sitemap:index", - tags: [ "mouser:sitemap" ], - data: { url: "https://www.mouser.com/indexgzipwww.xml" } - }, { - id: "tme:sitemap:index", - tags: [ "tme:sitemap" ], - data: { url: "https://www.tme.eu/en/sitemap.xml" } - // TODO: Delete derived sitemap entries - }, { - id: "farnell:sitemap:index", - tags: [ "farnell:sitemap" ], - data: { url: "https://uk.farnell.com/sitemap.xml" } - // TODO: Delete derived sitemap entries - }, { - id: "focus-lcds:home", - tags: [ "focus-lcds:home" ], - data: {} - }], - tags: { - "st:home": [ "st:findCategories" ], - "st:category": [ "st:scrapeCategory" ], - "st:product": [ "st:scrapeProduct", "st:normalizeProduct" ], - "lcsc:home": [ "lcsc:findCategories" ], - "lcsc:category": [ "lcsc:scrapeCategory" ], - "lcsc:product": [ "lcsc:normalizeProduct" ], - "tme:sitemap": [ "tme:scrapeSitemap" ], - "tme:product": [ "tme:scrapeProduct", "tme:normalizeProduct" ], - // "farnell:sitemap": [ "farnell:scrapeSitemap" ], - // "farnell:product": [ "farnell:scrapeProduct", "farnell:normalizeProduct" ], - "focus-lcds:home": [ "focus-lcds:findCategories" ], - 
"focus-lcds:category": [ "focus-lcds:scrapeCategory" ], - "focus-lcds:product": [ "focus-lcds:scrapeProduct", "focus-lcds:normalizeProduct" ], - }, - tasks: { - // ST Microelectronics - "st:findCategories": { - ttl: "15d", - run: require("./lib/st/task/find-categories")(state) - }, - "st:scrapeCategory": { - ttl: "1d", - taskInterval: "60s", - version: "2", - run: require("./lib/st/task/scrape-category")(state) - }, - "st:scrapeProduct": { - ttl: "15d", - taskInterval: "5s", - run: require("./lib/st/task/scrape-product")(state) - }, - "st:normalizeProduct": { - dependsOn: [ "st:scrapeProduct" ], - version: "8", - parallelTasks: 50, - run: require("./lib/st/task/normalize-product")(state) - }, - - // LCSC - // FIXME: Commenting out a bunch of tasks but not removing them from the tag assignments will result in an error, but will *not* exit the program. That's probably not right? - "lcsc:findCategories": { - ttl: "30d", - version: "1", - run: require("./lib/lcsc/task/find-categories")(state) - }, - "lcsc:scrapeCategory": { - ttl: "30d", - taskInterval: "1m", - run: require("./lib/lcsc/task/scrape-category")(state) - }, - "lcsc:normalizeProduct": { - version: "7", - parallelTasks: 50, - run: require("./lib/lcsc/task/normalize-product")(state) - }, - - // Mouser - "mouser:scrapeSitemap": { - taskInterval: "30s", - run: require("./lib/mouser/task/scrape-sitemap")(state) - }, - - // TME.eu - "tme:scrapeSitemap": { - ttl: "3d", - taskInterval: "30s", - run: require("./lib/tme/task/scrape-sitemap")(state) - }, - "tme:scrapeProduct": { - ttl: "60d", - taskInterval: "500ms", - run: require("./lib/tme/task/scrape-product")(state) - }, - "tme:normalizeProduct": { - dependsOn: [ "tme:scrapeProduct" ], - version: "5", - parallelTasks: 50, - run: require("./lib/tme/task/normalize-product")(state) - }, - - // Farnell - // "farnell:scrapeSitemap": { - // ttl: "3d", - // taskInterval: "30s", - // run: require("./lib/farnell/task/scrape-sitemap")(state) - // }, - // 
"farnell:scrapeProduct": { - // ttl: "60d", - // taskInterval: "500ms", - // run: require("./lib/farnell/task/scrape-product")(state) - // }, - // "farnell:normalizeProduct": { - // dependsOn: [ "farnell:scrapeProduct" ], - // version: "1", - // parallelTasks: 50, - // run: require("./lib/farnell/task/normalize-product")(state) - // }, - - // Focus LCDs - "focus-lcds:findCategories": { - ttl: "60d", - run: require("./lib/focus-lcds/task/find-categories")(state) - }, - "focus-lcds:scrapeCategory": { - ttl: "15d", - taskInterval: "1m", - run: require("./lib/focus-lcds/task/scrape-category")(state) - }, - "focus-lcds:scrapeProduct": { - ttl: "15d", - taskInterval: "5s", - run: require("./lib/focus-lcds/task/scrape-product")(state) - }, - "focus-lcds:normalizeProduct": { - dependsOn: [ "focus-lcds:scrapeProduct" ], - parallelTasks: 50, - run: require("./lib/focus-lcds/task/normalize-product")(state) - }, - } + seed: [], + tags: {}, + tasks: {} }; + +// NOTE: This is *not* currently a fully modular system! Identifiers (tags, task IDs, etc.) are still global to the srap instance as a whole, even though the code exists in different modules. Prefixing identifiers with the scraper they originate from, is still necessary! 
+module.exports = mergeSources(baseSchema, [ + require("./lib/sources/datasheets/lcsc")(state), + require("./lib/sources/datasheets/tme")(state), + require("./lib/sources/datasheets/st")(state), + require("./lib/sources/datasheets/focus-lcds")(state), +]); diff --git a/lib/focus-lcds/task/find-categories.js b/lib/focus-lcds/task/find-categories.js deleted file mode 100644 index 9d8dd1f..0000000 --- a/lib/focus-lcds/task/find-categories.js +++ /dev/null @@ -1,30 +0,0 @@ -"use strict"; - -const syncpipe = require("syncpipe"); -const url = require("url"); - -const assureResponse = require("../../shared/assure-response"); -const surgeon = require("../../shared/surgeon-utils"); -const uniqueArray = require("../../shared/unique-array"); - -module.exports = function findCategories({ session }) { - return async function({ createItem }) { - let response = await session.get("https://focuslcds.com/"); - assureResponse(response); - - let urls = syncpipe(null, [ - _ => surgeon(`selectMany ".category-list a" | readAttr href`, response.body.toString()), - _ => uniqueArray(_), - _ => _.filter((relativeURL) => relativeURL !== ""), - _ => _.map((relativeURL) => url.resolve("https://focuslcds.com/", relativeURL)) - ]); - - for (let url of urls) { - createItem({ - id: `focus-lcds:category:${url}`, - tags: [ "focus-lcds:category" ], - data: { url: url } - }); - } - }; -}; diff --git a/lib/focus-lcds/task/normalize-product.js b/lib/focus-lcds/task/normalize-product.js deleted file mode 100644 index 11d28c0..0000000 --- a/lib/focus-lcds/task/normalize-product.js +++ /dev/null @@ -1,19 +0,0 @@ -"use strict"; - -const createDatasheet = require("../../shared/create-datasheet"); - -module.exports = function normalizeProduct() { - return async function (api) { - let { data } = api; - - createDatasheet(api, { - priority: 0.8, - source: "focus-lcds", - manufacturer: data.itemData.manufacturer ?? 
"Focus LCDs", - productID: null, - name: data.itemData.name, - description: data.itemData.description, - url: data.itemData.datasheetURL - }); - }; -}; diff --git a/lib/focus-lcds/task/scrape-category.js b/lib/focus-lcds/task/scrape-category.js deleted file mode 100644 index 97bfd2e..0000000 --- a/lib/focus-lcds/task/scrape-category.js +++ /dev/null @@ -1,33 +0,0 @@ -"use strict"; - -const assureResponse = require("../../shared/assure-response"); -const surgeon = require("../../shared/surgeon-utils"); - -module.exports = function scrapeCategory({ session }) { - return async function({ data, createItem }) { - let response = await session.get(data.url); - assureResponse(response); - - let body = response.body.toString(); - - let nextPageURL = surgeon(`selectMaybeN ".pagination-item--next a" 0 | readAttr href`, body); - - if (nextPageURL != null) { - createItem({ - id: `focus-lcds:category:${nextPageURL}`, - tags: [ "focus-lcds:category" ], - data: { url: nextPageURL } - }); - } - - let items = surgeon(`selectMany "ul.productList .listItem-title a" | readAttr href`, body); - - for (let url of items) { - createItem({ - id: `focus-lcds:product:${url}`, - tags: [ "focus-lcds:product" ], - data: { url: url } - }); - } - }; -}; diff --git a/lib/focus-lcds/task/scrape-product.js b/lib/focus-lcds/task/scrape-product.js deleted file mode 100644 index b403369..0000000 --- a/lib/focus-lcds/task/scrape-product.js +++ /dev/null @@ -1,45 +0,0 @@ -"use strict"; - -const url = require("url"); - -const assureResponse = require("../../shared/assure-response"); -const surgeon = require("../../shared/surgeon-utils"); -const extractModelNumber = require("../../shared/extract-model-number"); - -module.exports = function scrapeProduct({ session }) { - return async function({ data, updateData, expireDependents }) { - let response = await session.get(data.url); - assureResponse(response); - - let body = response.body.toString(); - - let itemData = surgeon({ - name: [ `selectOne 
"meta[property='og:title']" | readAttr content`, extractModelNumber ], - manufacturer: `selectMaybeOne "[itemprop='brand']" | text | ignoreEmptyString`, - description: [ - `selectMaybeOne "meta[name='description']" | readAttr content`, - // Get rid of the keyword spam... - (description) => description.split(",")[0] - ], - image: `selectMaybeOne "meta[property='og:image']" | readAttr content`, - price: `selectMaybeOne .productView-price .price--withoutTax | text`, - datasheetURL: [ - `selectMaybeOne ".productView-info a[href^='/content/']" | readAttr href`, - (relativeURL) => (!/^\/content\/?$/.test(relativeURL)) - ? url.resolve("https://focuslcds.com/", relativeURL) - : null // Ignore when the datasheet URL is *just* /content/, as that means there is no datasheet for this product - ], - technicalSpecs: [ `selectAny "#tab-description .productView-info-name"`, { - name: `text`, - value: `nextUntil ".productView-info-name" ".productView-info-value" | index 0 | text` - }] - }, body); - - updateData((oldData) => ({ - ... 
oldData, - itemData: itemData - })); - - expireDependents(); - }; -}; diff --git a/lib/lcsc/task/find-categories.js b/lib/lcsc/task/find-categories.js deleted file mode 100644 index 2844ab8..0000000 --- a/lib/lcsc/task/find-categories.js +++ /dev/null @@ -1,42 +0,0 @@ -"use strict"; - -const assert = require("assert"); - -const assureResponse = require("../../shared/assure-response"); - -module.exports = function lcscFindCategories(state) { - let { session } = state; - - return async function ({ createItem }) { - let response = await session.get("https://wwwapi.lcsc.com/v1/home/category"); - - assureResponse(response); - assert(response.body.length > 0); - assert(response.statusCode === 200); - - function processCategoryEntries(categories) { - for (let category of categories) { - let productCount = category.productNum; - let pageCount = Math.ceil(productCount / 500); - - // NOTE: We do *not* use the page count indicated by the API, but instead calculate it ourself from the product count. This is because the API-specified page count will cap out at the equivalent of 10k items, even when more pages than that are actually available. - for (let i = 1; i <= pageCount; i++) { - createItem({ - id: `lcsc:category:${category.catalogId}:page-${i}`, - tags: [ "lcsc:category" ], - data: { - ... 
category, - pageNumber: i - } - }); - } - - if (category.childCatelogs != null) { - processCategoryEntries(category.childCatelogs); - } - } - } - - processCategoryEntries(response.body); - }; -}; diff --git a/lib/lcsc/task/normalize-product.js b/lib/lcsc/task/normalize-product.js deleted file mode 100644 index ac5db0d..0000000 --- a/lib/lcsc/task/normalize-product.js +++ /dev/null @@ -1,19 +0,0 @@ -"use strict"; - -const createDatasheet = require("../../shared/create-datasheet"); - -module.exports = function lcscNormalizeProduct() { - return async function (api) { - let { data } = api; - - createDatasheet(api, { - priority: 0.4, - source: "lcsc", - manufacturer: data.brandNameEn, - productID: data.productCode, - name: data.productModel, - description: data.productIntroEn, - url: data.pdfUrl - }); - }; -}; diff --git a/lib/lcsc/task/scrape-category.js b/lib/lcsc/task/scrape-category.js deleted file mode 100644 index 0adafda..0000000 --- a/lib/lcsc/task/scrape-category.js +++ /dev/null @@ -1,37 +0,0 @@ -"use strict"; - -const assert = require("assert"); - -const assureResponse = require("../../shared/assure-response"); - -// TODO: Validate response formats with validatem instead - -module.exports = function lcscScrapeCategory(state) { - let { session } = state; - - return async function ({ data, createItem, deleteItem, updateData }) { - let response = await session.post(`https://wwwapi.lcsc.com/v1/products/list`, { - catalogIdList: [ data.catalogId ], - currentPage: data.pageNumber, - pageSize: 500, - paramNameValueMap: {} - }); - - assureResponse(response); - assert(response.statusCode === 200); - assert(response.body.productList != null); // Missing from stale queued requests? 
- assert(response.body.productList.length > 0); - - for (let item of response.body.productList) { - createItem({ - // NOTE: item.productId seems like the database ID on the website, but item.productCode is the actual LCSC part number used internally for inventory management, so we use that for identification instead - id: `lcsc:product:${item.productCode}`, - tags: [ "lcsc:product" ], - data: item - }); - } - - // We don't keep around page items, because the amount of pages for a category can change, and so this isn't a stable identifier. They'll be recreated on the next category scrape anyway. - deleteItem(); - }; -}; diff --git a/lib/merge-sources.js b/lib/merge-sources.js new file mode 100644 index 0000000..78cf67b --- /dev/null +++ b/lib/merge-sources.js @@ -0,0 +1,21 @@ +"use strict"; + +const mergeByTemplate = require("merge-by-template"); + +function noOverride(a, b) { + if (a != null && b != null) { + throw new Error(`Property cannot be overridden`); + } +} + +let merge = mergeByTemplate.createMerger({ + backend: noOverride, + database: noOverride, + seed: [], + tags: mergeByTemplate.anyProperty([]), + tasks: {} +}); + +module.exports = function mergeSources(base, sources) { + return merge([ base, ... 
sources ]); +}; diff --git a/lib/shared/surgeon-utils.js b/lib/shared/surgeon-utils.js index d4b6d5c..e48d0dd 100644 --- a/lib/shared/surgeon-utils.js +++ b/lib/shared/surgeon-utils.js @@ -267,4 +267,4 @@ module.exports = surgeon.default({ return url.resolve(base, input); } } -});; +}); diff --git a/lib/sources/datasheets/focus-lcds.js b/lib/sources/datasheets/focus-lcds.js new file mode 100644 index 0000000..ea8b83d --- /dev/null +++ b/lib/sources/datasheets/focus-lcds.js @@ -0,0 +1,137 @@ +"use strict"; + +const syncpipe = require("syncpipe"); +const url = require("url"); + +const assureResponse = require("../../shared/assure-response"); +const surgeon = require("../../shared/surgeon-utils"); +const uniqueArray = require("../../shared/unique-array"); +const extractModelNumber = require("../../shared/extract-model-number"); +const createDatasheet = require("../../shared/create-datasheet"); + +// Focus LCDs + +module.exports = function ({ session }) { + return { + seed: [{ + id: "focus-lcds:home", + tags: [ "focus-lcds:home" ], + data: {} + }], + tags: { + "focus-lcds:home": [ "focus-lcds:findCategories" ], + "focus-lcds:category": [ "focus-lcds:scrapeCategory" ], + "focus-lcds:product": [ "focus-lcds:scrapeProduct", "focus-lcds:normalizeProduct" ], + }, + tasks: { + "focus-lcds:findCategories": { + ttl: "60d", + run: async function({ createItem }) { + let response = await session.get("https://focuslcds.com/"); + assureResponse(response); + + let urls = syncpipe(null, [ + _ => surgeon(`selectMany ".category-list a" | readAttr href`, response.body.toString()), + _ => uniqueArray(_), + _ => _.filter((relativeURL) => relativeURL !== ""), + _ => _.map((relativeURL) => url.resolve("https://focuslcds.com/", relativeURL)) + ]); + + for (let url of urls) { + createItem({ + id: `focus-lcds:category:${url}`, + tags: [ "focus-lcds:category" ], + data: { url: url } + }); + } + } + }, + "focus-lcds:scrapeCategory": { + ttl: "15d", + taskInterval: "1m", + run: async function({ 
data, createItem }) { + let response = await session.get(data.url); + assureResponse(response); + + let body = response.body.toString(); + + let nextPageURL = surgeon(`selectMaybeN ".pagination-item--next a" 0 | readAttr href`, body); + + if (nextPageURL != null) { + createItem({ + id: `focus-lcds:category:${nextPageURL}`, + tags: [ "focus-lcds:category" ], + data: { url: nextPageURL } + }); + } + + let items = surgeon(`selectMany "ul.productList .listItem-title a" | readAttr href`, body); + + for (let url of items) { + createItem({ + id: `focus-lcds:product:${url}`, + tags: [ "focus-lcds:product" ], + data: { url: url } + }); + } + } + }, + "focus-lcds:scrapeProduct": { + ttl: "15d", + taskInterval: "5s", + run: async function({ data, updateData, expireDependents }) { + let response = await session.get(data.url); + assureResponse(response); + + let body = response.body.toString(); + + let itemData = surgeon({ + name: [ `selectOne "meta[property='og:title']" | readAttr content`, extractModelNumber ], + manufacturer: `selectMaybeOne "[itemprop='brand']" | text | ignoreEmptyString`, + description: [ + `selectMaybeOne "meta[name='description']" | readAttr content`, + // Get rid of the keyword spam... + (description) => description.split(",")[0] + ], + image: `selectMaybeOne "meta[property='og:image']" | readAttr content`, + price: `selectMaybeOne .productView-price .price--withoutTax | text`, + datasheetURL: [ + `selectMaybeOne ".productView-info a[href^='/content/']" | readAttr href`, + (relativeURL) => (!/^\/content\/?$/.test(relativeURL)) + ? url.resolve("https://focuslcds.com/", relativeURL) + : null // Ignore when the datasheet URL is *just* /content/, as that means there is no datasheet for this product + ], + technicalSpecs: [ `selectAny "#tab-description .productView-info-name"`, { + name: `text`, + value: `nextUntil ".productView-info-name" ".productView-info-value" | index 0 | text` + }] + }, body); + + updateData((oldData) => ({ + ... 
oldData, + itemData: itemData + })); + + expireDependents(); + } + }, + "focus-lcds:normalizeProduct": { + dependsOn: [ "focus-lcds:scrapeProduct" ], + parallelTasks: 50, + run: async function (api) { + let { data } = api; + + createDatasheet(api, { + priority: 0.8, + source: "focus-lcds", + manufacturer: data.itemData.manufacturer ?? "Focus LCDs", + productID: null, + name: data.itemData.name, + description: data.itemData.description, + url: data.itemData.datasheetURL + }); + } + }, + } + }; +}; diff --git a/lib/sources/datasheets/lcsc.js b/lib/sources/datasheets/lcsc.js new file mode 100644 index 0000000..3501e31 --- /dev/null +++ b/lib/sources/datasheets/lcsc.js @@ -0,0 +1,108 @@ +"use strict"; + +const assert = require("assert"); + +const assureResponse = require("../../shared/assure-response"); +const createDatasheet = require("../../shared/create-datasheet"); + +// LCSC +// TODO: Validate response formats with validatem instead + +module.exports = function ({ session }) { + return { + seed: [{ + id: "lcsc:home", + tags: [ "lcsc:home" ], + data: {} + }], + tags: { + "lcsc:home": [ "lcsc:findCategories" ], + "lcsc:category": [ "lcsc:scrapeCategory" ], + "lcsc:product": [ "lcsc:normalizeProduct" ], + }, + tasks: { + "lcsc:findCategories": { + ttl: "30d", + version: "1", + run: async function ({ storeItem }) { + let response = await session.get("https://wwwapi.lcsc.com/v1/home/category"); + + assureResponse(response); + assert(response.body.length > 0); + assert(response.statusCode === 200); + + function processCategoryEntries(categories) { + for (let category of categories) { + let productCount = category.productNum; + let pageCount = Math.ceil(productCount / 500); + + // NOTE: We do *not* use the page count indicated by the API, but instead calculate it ourself from the product count. This is because the API-specified page count will cap out at the equivalent of 10k items, even when more pages than that are actually available. 
+ for (let i = 1; i <= pageCount; i++) { + storeItem({ + id: `lcsc:category:${category.catalogId}:page-${i}`, + tags: [ "lcsc:category" ], + data: { + ... category, + pageNumber: i + } + }); + } + + if (category.childCatelogs != null) { + processCategoryEntries(category.childCatelogs); + } + } + } + + processCategoryEntries(response.body); + } + }, + "lcsc:scrapeCategory": { + ttl: "30d", + taskInterval: "1m", + run: async function ({ data, storeItem, deleteItem }) { + let response = await session.post(`https://wwwapi.lcsc.com/v1/products/list`, { + catalogIdList: [ data.catalogId ], + currentPage: data.pageNumber, + pageSize: 500, + paramNameValueMap: {} + }); + + assureResponse(response); + assert(response.statusCode === 200); + assert(response.body.productList != null); // Missing from stale queued requests? + assert(response.body.productList.length > 0); + + for (let item of response.body.productList) { + storeItem({ + // NOTE: item.productId seems like the database ID on the website, but item.productCode is the actual LCSC part number used internally for inventory management, so we use that for identification instead + id: `lcsc:product:${item.productCode}`, + tags: [ "lcsc:product" ], + data: item + }); + } + + // We don't keep around page items, because the amount of pages for a category can change, and so this isn't a stable identifier. They'll be recreated on the next category scrape anyway. 
+ deleteItem(); + } + }, + "lcsc:normalizeProduct": { + version: "7", + parallelTasks: 50, + run: async function (api) { + let { data } = api; + + createDatasheet(api, { + priority: 0.4, + source: "lcsc", + manufacturer: data.brandNameEn, + productID: data.productCode, + name: data.productModel, + description: data.productIntroEn, + url: data.pdfUrl + }); + } + }, + } + }; +}; diff --git a/lib/sources/datasheets/st.js b/lib/sources/datasheets/st.js new file mode 100644 index 0000000..1490790 --- /dev/null +++ b/lib/sources/datasheets/st.js @@ -0,0 +1,211 @@ +"use strict"; + +const cheerio = require("cheerio"); +const url = require("url"); +const assert = require("assert"); +const syncpipe = require("syncpipe"); +const htmlEntities = require("html-entities"); + +const createDatasheet = require("../../shared/create-datasheet"); +const assureResponse = require("../../shared/assure-response"); +const getUntaggedText = require("../../shared/get-untagged-text"); + +// ST Microelectronics + +function extractID(string) { + // Quick-and-dirty category ID parsing from category pages + let match = /"prmisID":"([^"]+)"/.exec(string); + + if (match != null) { + return match[1]; + } else { + throw new Error(`ST: prmis ID expected but not found`); + } +} + +module.exports = function ({ session }) { + return { + seed: [{ + id: "st:home", + tags: [ "st:home" ], + data: {} + }], + tags: { + "st:home": [ "st:findCategories" ], + "st:category": [ "st:scrapeCategory" ], + "st:product": [ "st:scrapeProduct", "st:normalizeProduct" ], + }, + tasks: { + "st:findCategories": { + ttl: "15d", + run: async function ({ createItem }) { + let response = await session.get("https://www.st.com/content/st_com/en.html"); + assureResponse(response); + + let $ = cheerio.load(response.body); + + let links = $("#Top_Menu_Products :is(a.st-nav__blockmenu-title, a.st-nav__blockmenu-link)") + .toArray() + .map((element) => $(element).attr("href")) + .map((relativeURL) => url.resolve("https://www.st.com/", 
relativeURL)); + + for (let link of links) { + createItem({ + id: `st:category:${link}`, + tags: [ "st:category" ], + data: { url: link } + }); + } + } + }, + "st:scrapeCategory": { + ttl: "1d", + taskInterval: "60s", + version: "2", + run: async function({ data, createItem }) { + let response = await session.get(data.url); + assureResponse(response); + + let prmisID = extractID(response.body.toString()); + + let listingResponse = await session.get(`https://www.st.com/en/documentation/scraper.cxst-ps-grid.html/${encodeURIComponent(prmisID)}.json`, { noDecode: true }); + assureResponse(listingResponse); + let listingBuffer = listingResponse.body; + + if (listingBuffer.length > 0) { + // This is a category that has a product explorer + let listing = JSON.parse(listingBuffer.toString()); + + let cellNames = listing.columns.map((column) => { + let cellName = (column.identifier != null) + ? `${column.identifier}_${column.qualifier_identifier}` + : `nonstandard:${column.name}:${column.qualifier}`; + + createItem({ + id: `st:column:${cellName}`, + tags: [ "st:column" ], + data: column + }); + + return cellName; + }); + + for (let row of listing.rows) { + assert(row.productId != null); + + let cellData = syncpipe(row.cells, [ + (_) => _.map((cell, i) => [ cellNames[i], cell.value ]), + (_) => Object.fromEntries(_) + ]); + + createItem({ + id: `st:product:${row.productId}`, + tags: [ "st:product" ], + data: { + ... 
row, + cells: undefined, + cellData: cellData + } + }); + } + } else { + console.warn("Warning: empty response, category does not have product explorer"); + } + } + }, + "st:scrapeProduct": { + ttl: "15d", + taskInterval: "5s", + run: async function({ data, createItem, updateData, expireDependents }) { + if (data.productFolderUrl == null) { + throw new Error(`No known product page URL`); + } + + let productPageURL = url.resolve("https://www.st.com/", data.productFolderUrl); + + let response = await session.get(productPageURL); + assureResponse(response); + + let $ = cheerio.load(response.body); + + let datasheetLink = $("a[data-js='datasheetLink']").attr("href"); + let datasheetURL = (datasheetLink != null) + ? url.resolve(productPageURL, datasheetLink) + : null; + + let resources = $(".st-table--resources") + .find("h3").toArray() + .map((heading) => { + let $heading = $(heading); + let sectionID = $heading.attr("id"); + let sectionTitle = $heading.text().trim(); + + let $table = $heading.next("table"); + + let items = $table + .find("tbody tr").toArray() + .map((row) => { + let $row = $(row); + let $mainView = $row.find(".visible-on-desktop-only"); + let $link = $mainView.find("a.st-link"); + + return { + url: url.resolve(productPageURL, $link.attr("href")), + documentID: $link.find("span.st-font--bold").text().trim(), + description: $link.find("span:not(.st-font--bold)").text().trim(), + version: getUntaggedText($link), + date: $row.find(".visible-on-desktop-only[data-latest-update]").text().trim() + }; + }); + + return { + sectionID: sectionID, + sectionTitle: sectionTitle, + items: items + }; + }); + + updateData((data) => { + return { + ... data, + datasheetLink: datasheetURL, + resources: resources + }; + }); + + expireDependents(); + + for (let section of resources) { + for (let resource of section.items) { + createItem({ + id: `st:resource:${resource.url}`, + tags: (resource.url === datasheetURL) + ? 
[ "st:resource", "st:datasheet" ] + : [ "st:resource" ], + data: { url: resource.url } + }); + } + } + } + }, + "st:normalizeProduct": { + dependsOn: [ "st:scrapeProduct" ], + version: "8", + parallelTasks: 50, + run: async function (api) { + let { data } = api; + + createDatasheet(api, { + priority: 0.8, + source: "st", + manufacturer: "STMicroelectronics", + productID: data.productId, + name: data.cellData["XJE010_VT-007"], + description: htmlEntities.decode(data.cellData["XJE014_VT-007"]), + url: data.datasheetLink + }); + } + }, + } + }; +}; diff --git a/lib/sources/datasheets/tme.js b/lib/sources/datasheets/tme.js new file mode 100644 index 0000000..0fd1ec5 --- /dev/null +++ b/lib/sources/datasheets/tme.js @@ -0,0 +1,202 @@ +"use strict"; + +const assert = require("assert"); +const cheerio = require("cheerio"); +const syncpipe = require("syncpipe"); +const url = require("url"); + +const pipe = require("@promistream/pipe"); +const simpleSink = require("@promistream/simple-sink"); +const assureResponse = require("../../shared/assure-response"); +const parseSitemapResponse = require("../../shared/parse-sitemap-response"); +const createDatasheet = require("../../shared/create-datasheet"); + +// TME.eu + +function firstMatch(options) { + for (let option of options) { + if (option != null && option !== "") { + return option; + } + } +} + +module.exports = function ({ session }) { + return { + seed: [{ + id: "tme:sitemap:index", + tags: [ "tme:sitemap" ], + data: { url: "https://www.tme.eu/en/sitemap.xml" } + // TODO: Delete derived sitemap entries + }], + tags: { + "tme:sitemap": [ "tme:scrapeSitemap" ], + "tme:product": [ "tme:scrapeProduct", "tme:normalizeProduct" ], + }, + tasks: { + "tme:scrapeSitemap": { + ttl: "3d", + taskInterval: "30s", + run: async function ({ data, createItem, deleteItem }) { + let response = await session.get(data.url, { stream: true }); + assureResponse(response); + + let resultCount = 0; + + await pipe([ + 
parseSitemapResponse(response), + simpleSink((item) => { + assert(item.url); + + if (item.type === "sitemap") { + // NOTE: We are only interested in the sitemaps that enumerate components, not those that list categories etc. + if (/pip_part[0-9]+\.xml(\.gz)?/.test(item.url)) { + createItem({ + id: `tme:sitemap:${item.url}`, + tags: [ "tme:sitemap" ], + data: { url: item.url } + }); + + resultCount += 1; + } + } else if (item.type === "url") { + if (item.url.startsWith("https://www.tme.eu/en/details/")) { + createItem({ + id: `tme:product:${item.url}`, + tags: [ "tme:product" ], + data: { url: item.url } + }); + + resultCount += 1; + } + } + }) + ]).read(); + + // If we don't get at least *some* items out of a sitemap, something is wrong - eg. the URL format changed and we are no longer matching anything. + assert(resultCount > 0); + + // FIXME: Do this here? Or is there a reason *not* to delete the sitemap entry? + // deleteItem(); + } + }, + "tme:scrapeProduct": { + ttl: "60d", + taskInterval: "500ms", + run: async function ({ data, createAlias, updateData, expireDependents }) { + let response = await session.get(data.url); + assureResponse(response); + + let $ = cheerio.load(response.body); + + // FIXME: This is currently broken! 
+ let allMetaHeaders = syncpipe($("h2.o-semantic-only-header").toArray(), [ + (_) => _.map((header) => { + let $header = $(header); + + return [ + $header.find(".name").text().trim(), + $header.find(".value").text().trim() + ]; + }), + (_) => Object.fromEntries(_) + ]); + + let descriptionElement = $(".c-pip__description > h2").eq(0); + + let itemData = { + productID: $("h2.c-pip__symbol--tme .c-pip__symbol-value").eq(0).text().trim(), + manufacturer: $("a.pip__product-header-title").eq(0).text().trim(), + model: firstMatch([ + $("h2.c-pip__symbol--producer .c-pip__symbol-value").eq(0).text().trim(), // Manufacturer part number + $("h2.c-pip__symbol--tme .c-pip__symbol-value").eq(0).text().trim(), // TME Symbol + ]), + description: (descriptionElement.children().length === 0) // This skips meta fields if there is no description element + ? descriptionElement.text().trim() + : null, + documents: $("div.c-pip__document > a").toArray() + .map((link) => { + let relativeLink = $(link).attr("href"); + + if (relativeLink != null) { + return { + description: $(link).text().trim(), + url: url.resolve( + data.url, + relativeLink + ) + }; + } else { + // Probably a video popup + return null; + } + }) + .filter((item) => { + return item != null; + }), + // TODO: Scrape prices + }; + + assert(itemData.productID != null); + assert(itemData.manufacturer != null); + assert(itemData.model != null); + + createAlias({ from: `tme:product:${itemData.productID}` }); + + updateData((oldData) => ({ + ... 
oldData, + itemData: itemData + })); + + expireDependents(); + } + }, + "tme:normalizeProduct": { + dependsOn: [ "tme:scrapeProduct" ], + version: "5", + parallelTasks: 50, + run: async function (api) { + let { data } = api; + + function isEnglish(document) { + return /\sen\s*$/i.test(document.description); + } + + if (data.itemData.documents.length > 0) { + if (typeof data.itemData.documents[0] === "string") { + // Temporary workaround for the dependsOn not taking into account task versions, and some old records existing with a wrong documents structure + return; + } + + let manufacturer = data.itemData.manufacturer; + let modelName = data.itemData.model; + let description = data.itemData.description; + let productID = data.itemData.productID; + + let firstEnglish = data.itemData.documents.find((document) => isEnglish(document)); + + let bestDocument = (firstEnglish != null) + ? firstEnglish + : data.itemData.documents[0]; + + let bestDocumentIsEnglish = isEnglish(bestDocument); + + createDatasheet(api, { + priority: (bestDocumentIsEnglish) + ? 0.6 + : 0.5, + source: "tme", + manufacturer: manufacturer, + productID: productID, + name: modelName, + description: description, + url: bestDocument.url, + // NOTE: Most (but not all!) manufacturers on TME are, incorrectly, in ALL-CAPS. This 'fixes' those cases through best-effort capitalization. Many (but less!) will still be wrong and need to be fixed later. 
+ fixCasing: true + }); + } + } + }, + } + }; +}; diff --git a/lib/st/extract-id.js b/lib/st/extract-id.js deleted file mode 100644 index c70292e..0000000 --- a/lib/st/extract-id.js +++ /dev/null @@ -1,12 +0,0 @@ -"use strict"; - -module.exports = function extractID(string) { - // Quick-and-dirty category ID parsing from category pages - let match = /"prmisID":"([^"]+)"/.exec(string); - - if (match != null) { - return match[1]; - } else { - throw new Error(`ST: prmis ID expected but not found`); - } -}; diff --git a/lib/st/task/find-categories.js b/lib/st/task/find-categories.js deleted file mode 100644 index f7f2d6d..0000000 --- a/lib/st/task/find-categories.js +++ /dev/null @@ -1,28 +0,0 @@ -"use strict"; - -const cheerio = require("cheerio"); -const url = require("url"); - -const assureResponse = require("../../shared/assure-response"); - -module.exports = function findCategories({ session }) { - return async function ({ createItem }) { - let response = await session.get("https://www.st.com/content/st_com/en.html"); - assureResponse(response); - - let $ = cheerio.load(response.body); - - let links = $("#Top_Menu_Products :is(a.st-nav__blockmenu-title, a.st-nav__blockmenu-link)") - .toArray() - .map((element) => $(element).attr("href")) - .map((relativeURL) => url.resolve("https://www.st.com/", relativeURL)); - - for (let link of links) { - createItem({ - id: `st:category:${link}`, - tags: [ "st:category" ], - data: { url: link } - }); - } - }; -}; diff --git a/lib/st/task/normalize-product.js b/lib/st/task/normalize-product.js deleted file mode 100644 index c64c96a..0000000 --- a/lib/st/task/normalize-product.js +++ /dev/null @@ -1,20 +0,0 @@ -"use strict"; - -const htmlEntities = require("html-entities"); -const createDatasheet = require("../../shared/create-datasheet"); - -module.exports = function normalizeProduct() { - return async function (api) { - let { data } = api; - - createDatasheet(api, { - priority: 0.8, - source: "st", - manufacturer: 
"STMicroelectronics", - productID: data.productId, - name: data.cellData["XJE010_VT-007"], - description: htmlEntities.decode(data.cellData["XJE014_VT-007"]), - url: data.datasheetLink - }); - }; -}; diff --git a/lib/st/task/scrape-category.js b/lib/st/task/scrape-category.js deleted file mode 100644 index 7a5d0f3..0000000 --- a/lib/st/task/scrape-category.js +++ /dev/null @@ -1,60 +0,0 @@ -"use strict"; - -const assert = require("assert"); -const syncpipe = require("syncpipe"); - -const assureResponse = require("../../shared/assure-response"); -const extractId = require("../extract-id"); - -module.exports = function scrapeCategory({ session }) { - return async function({ data, createItem }) { - let response = await session.get(data.url); - assureResponse(response); - - let prmisID = extractId(response.body.toString()); - - let listingResponse = await session.get(`https://www.st.com/en/documentation/scraper.cxst-ps-grid.html/${encodeURIComponent(prmisID)}.json`, { noDecode: true }); - assureResponse(response); - let listingBuffer = listingResponse.body; - - if (listingBuffer.length > 0) { - // This is a category that has a product explorer - let listing = JSON.parse(listingBuffer.toString()); - - let cellNames = listing.columns.map((column) => { - let cellName = (column.identifier != null) - ? `${column.identifier}_${column.qualifier_identifier}` - : `nonstandard:${column.name}:${column.qualifier}` - - createItem({ - id: `st:column:${cellName}`, - tags: [ "st:column" ], - data: column - }); - - return cellName; - }); - - for (let row of listing.rows) { - assert(row.productId != null); - - let cellData = syncpipe(row.cells, [ - (_) => _.map((cell, i) => [ cellNames[i], cell.value ]), - (_) => Object.fromEntries(_) - ]); - - createItem({ - id: `st:product:${row.productId}`, - tags: [ "st:product" ], - data: { - ... 
row, - cells: undefined, - cellData: cellData - } - }); - } - } else { - console.warn("Warning: empty response, category does not have product explorer"); - } - }; -}; diff --git a/lib/st/task/scrape-product.js b/lib/st/task/scrape-product.js deleted file mode 100644 index 7126401..0000000 --- a/lib/st/task/scrape-product.js +++ /dev/null @@ -1,81 +0,0 @@ -"use strict"; - -const cheerio = require("cheerio"); -const url = require("url"); - -const assureResponse = require("../../shared/assure-response"); -const getUntaggedText = require("../../shared/get-untagged-text"); - -module.exports = function scrapeProduct({ session }) { - return async function({ data, createItem, updateData, expireDependents }) { - if (data.productFolderUrl == null) { - throw new Error(`No known product page URL`); - } - - let productPageURL = url.resolve("https://www.st.com/", data.productFolderUrl); - - let response = await session.get(productPageURL); - assureResponse(response); - - let $ = cheerio.load(response.body); - - let datasheetLink = $("a[data-js='datasheetLink']").attr("href"); - let datasheetURL = (datasheetLink != null) - ? 
url.resolve(productPageURL, datasheetLink) - : null; - - let resources = $(".st-table--resources") - .find("h3").toArray() - .map((heading) => { - let $heading = $(heading); - let sectionID = $heading.attr("id"); - let sectionTitle = $heading.text().trim(); - - let $table = $heading.next("table"); - - let items = $table - .find("tbody tr").toArray() - .map((row) => { - let $row = $(row); - let $mainView = $row.find(".visible-on-desktop-only"); - let $link = $mainView.find("a.st-link"); - - return { - url: url.resolve(productPageURL, $link.attr("href")), - documentID: $link.find("span.st-font--bold").text().trim(), - description: $link.find("span:not(.st-font--bold)").text().trim(), - version: getUntaggedText($link), - date: $row.find(".visible-on-desktop-only[data-latest-update]").text().trim() - }; - }); - - return { - sectionID: sectionID, - sectionTitle: sectionTitle, - items: items - }; - }); - - updateData((data) => { - return { - ... data, - datasheetLink: datasheetURL, - resources: resources - }; - }); - - expireDependents(); - - for (let section of resources) { - for (let resource of section.items) { - createItem({ - id: `st:resource:${resource.url}`, - tags: (resource.url === datasheetURL) - ? 
[ "st:resource", "st:datasheet" ] - : [ "st_resource" ], - data: { url: resource.url } - }); - } - } - }; -}; diff --git a/lib/tme/task/normalize-product.js b/lib/tme/task/normalize-product.js deleted file mode 100644 index 9412abb..0000000 --- a/lib/tme/task/normalize-product.js +++ /dev/null @@ -1,47 +0,0 @@ -"use strict"; - -const createDatasheet = require("../../shared/create-datasheet"); - -function isEnglish(document) { - return /\sen\s*$/i.test(document.description); -} - -module.exports = function tmeNormalizeProduct() { - return async function (api) { - let { data } = api; - - if (data.itemData.documents.length > 0) { - if (typeof data.itemData.documents[0] === "string") { - // Temporary workaround for the dependsOn not taking into account task versions, and some old records existing with a wrong documents structure - return; - } - - let manufacturer = data.itemData.manufacturer; - let modelName = data.itemData.model; - let description = data.itemData.description; - let productID = data.itemData.productID; - - let firstEnglish = data.itemData.documents.find((document) => isEnglish(document)); - - let bestDocument = (firstEnglish != null) - ? firstEnglish - : data.itemData.documents[0]; - - let bestDocumentIsEnglish = isEnglish(bestDocument); - - createDatasheet(api, { - priority: (bestDocumentIsEnglish) - ? 0.6 - : 0.5, - source: "tme", - manufacturer: manufacturer, - productID: productID, - name: modelName, - description: description, - url: bestDocument.url, - // NOTE: Most (but not all!) manufacturers on TME are, incorrectly, in ALL-CAPS. This 'fixes' those cases through best-effort capitalization. Many (but less!) will still be wrong and need to be fixed later. 
- fixCasing: true - }); - } - }; -}; diff --git a/lib/tme/task/scrape-product.js b/lib/tme/task/scrape-product.js deleted file mode 100644 index da042c0..0000000 --- a/lib/tme/task/scrape-product.js +++ /dev/null @@ -1,86 +0,0 @@ -"use strict"; - -const assert = require("assert"); -const cheerio = require("cheerio"); -const syncpipe = require("syncpipe"); -const url = require("url"); - -const assureResponse = require("../../shared/assure-response"); - -function firstMatch(options) { - for (let option of options) { - if (option != null && option !== "") { - return option; - } - } -} - -module.exports = function tmeScrapeProduct({ session }) { - return async function ({ data, createAlias, updateData, expireDependents }) { - let response = await session.get(data.url); - assureResponse(response); - - let $ = cheerio.load(response.body); - - // FIXME: This is currently broken! - let allMetaHeaders = syncpipe($("h2.o-semantic-only-header").toArray(), [ - (_) => _.map((header) => { - let $header = $(header); - - return [ - $header.find(".name").text().trim(), - $header.find(".value").text().trim() - ]; - }), - (_) => Object.fromEntries(_) - ]); - - let descriptionElement = $(".c-pip__description > h2").eq(0); - - let itemData = { - productID: $("h2.c-pip__symbol--tme .c-pip__symbol-value").eq(0).text().trim(), - manufacturer: $("a.pip__product-header-title").eq(0).text().trim(), - model: firstMatch([ - $("h2.c-pip__symbol--producer .c-pip__symbol-value").eq(0).text().trim(), // Manufacturer part number - $("h2.c-pip__symbol--tme .c-pip__symbol-value").eq(0).text().trim(), // TME Symbol - ]), - description: (descriptionElement.children().length === 0) // This skips meta fields if there is no description element - ? 
descriptionElement.text().trim() - : null, - documents: $("div.c-pip__document > a").toArray() - .map((link) => { - let relativeLink = $(link).attr("href"); - - if (relativeLink != null) { - return { - description: $(link).text().trim(), - url: url.resolve( - data.url, - relativeLink - ) - }; - } else { - // Probably a video popup - return null; - } - }) - .filter((item) => { - return item != null; - }), - // TODO: Scrape prices - }; - - assert(itemData.productID != null); - assert(itemData.manufacturer != null); - assert(itemData.model != null); - - createAlias({ from: `tme:product:${itemData.productID}` }); - - updateData((oldData) => ({ - ... oldData, - itemData: itemData - })); - - expireDependents(); - }; -}; diff --git a/lib/tme/task/scrape-sitemap.js b/lib/tme/task/scrape-sitemap.js deleted file mode 100644 index 3bb61e7..0000000 --- a/lib/tme/task/scrape-sitemap.js +++ /dev/null @@ -1,50 +0,0 @@ -"use strict"; - -const assert = require("assert"); - -const pipe = require("@promistream/pipe"); -const simpleSink = require("@promistream/simple-sink"); -const assureResponse = require("../../shared/assure-response"); -const parseSitemapResponse = require("../../shared/parse-sitemap-response"); - -module.exports = function tmeScrapeSitemap({ session }) { - return async function ({ data, createItem }) { - let response = await session.get(data.url, { stream: true }); - assureResponse(response); - - let resultCount = 0; - - await pipe([ - parseSitemapResponse(response), - simpleSink((item) => { - assert(item.url); - - if (item.type === "sitemap") { - // NOTE: We are only interested in the sitemaps that enumerate components, not those that list categories etc. 
- if (/pip_part[0-9]+\.xml(\.gz)?/.test(item.url)) { - createItem({ - id: `tme:sitemap:${item.url}`, - tags: [ "tme:sitemap" ], - data: { url: item.url } - }); - - resultCount += 1; - } - } else if (item.type === "url") { - if (item.url.startsWith("https://www.tme.eu/en/details/")) { - createItem({ - id: `tme:product:${item.url}`, - tags: [ "tme:product" ], - data: { url: item.url } - }); - - resultCount += 1; - } - } - }) - ]).read(); - - // If we don't get at least *some* items out of a sitemap, something is wrong - eg. the URL format changed and we are no longer matching anything. - assert(resultCount > 0); - }; -}; diff --git a/package.json b/package.json index e44f0b9..774f726 100644 --- a/package.json +++ b/package.json @@ -24,12 +24,13 @@ "html-entities": "^2.1.1", "map-obj": "^4.2.0", "match-value": "^1.1.0", + "merge-by-template": "^0.1.4", "pianola": "^2.2.1", "surgeon": "^3.16.4", "syncpipe": "^1.0.0" }, "devDependencies": { - "@joepie91/eslint-config": "^1.1.0", + "@joepie91/eslint-config": "^1.1.1", "eslint": "^7.22.0" } } diff --git a/test-import.js b/test-import.js new file mode 100644 index 0000000..3e09b66 --- /dev/null +++ b/test-import.js @@ -0,0 +1,3 @@ +"use strict"; + +console.log(require("fix-esm").require("execall").toString()) diff --git a/todo.txt b/todo.txt new file mode 100644 index 0000000..52cbe78 --- /dev/null +++ b/todo.txt @@ -0,0 +1,7 @@ +- add source: http://www.injoinic.com/product_detail/id/21.html +- LCSC: remove placeholder URLs that aren't actually datasheets +- rework createDatasheet: + - retain alternate options + - track language of each entry (when known) + - also retain non-datasheet documentation +- add source: realtek diff --git a/yarn.lock b/yarn.lock index f59b73b..8061d85 100644 --- a/yarn.lock +++ b/yarn.lock @@ -263,10 +263,10 @@ resolved "https://registry.yarnpkg.com/@joepie91/consumable/-/consumable-1.0.1.tgz#fd223a481b89b43bfe98687bd7f7ce586826f832" integrity 
sha512-LUOoJmFAJ6ocqymtVUiADFvx7T+EFQsfsY6LAOvYBKHlxpWQ/LiQGAi/k5tzATxXpH4/vLC4C9ttRl09/g+HRw== -"@joepie91/eslint-config@^1.1.0": - version "1.1.0" - resolved "https://registry.yarnpkg.com/@joepie91/eslint-config/-/eslint-config-1.1.0.tgz#9397e6ce0a010cb57dcf8aef8754d3a5ce0ae36a" - integrity sha512-XliasRSUfOz1/bAvTBaUlCjWDbceCW4y1DnvFfW7Yw9p2FbNRR0w8WoPdTxTCjKuoZ7/OQMeBxIe2y9Qy6rbYw== +"@joepie91/eslint-config@^1.1.1": + version "1.1.1" + resolved "https://registry.yarnpkg.com/@joepie91/eslint-config/-/eslint-config-1.1.1.tgz#cb276dec6dd25b5777daefbef561850c9717180d" + integrity sha512-q8l83tdpL0YGC24ftlpeHgmQIIRmcpiVhwwEUFPcJ1YXWaee/JjoUs6e5tLKMTNNk+fvDKtq2YPSXkmLQU7h5Q== "@joepie91/unreachable@^1.0.0": version "1.0.0" @@ -588,7 +588,7 @@ "@validatem/is-array" "^0.1.0" "@validatem/validation-result" "^0.1.1" -"@validatem/combinator@^0.1.0", "@validatem/combinator@^0.1.1": +"@validatem/combinator@^0.1.0", "@validatem/combinator@^0.1.1", "@validatem/combinator@^0.1.2": version "0.1.2" resolved "https://registry.yarnpkg.com/@validatem/combinator/-/combinator-0.1.2.tgz#eab893d55f1643b9c6857eaf6ff7ed2a728e89ff" integrity sha512-vE8t1tNXknmN62FlN6LxQmA2c6TwVKZ+fl/Wit3H2unFdOhu7SZj2kRPGjAXdK/ARh/3svYfUBeD75pea0j1Sw== @@ -645,6 +645,32 @@ supports-color "^7.1.0" syncpipe "^1.0.0" +"@validatem/core@^0.3.3": + version "0.3.17" + resolved "https://registry.yarnpkg.com/@validatem/core/-/core-0.3.17.tgz#1756a7eca0523a3657794d2060273f7d42c083ef" + integrity sha512-VahE9TAKpaU13BcVQI/Dc9j/xsm/BgloRM0v1HjOMpoJ16tOkKQkUdOgiDCG4zmEek1bG3v9Zu4lS1lubgjLMw== + dependencies: + "@validatem/annotate-errors" "^0.1.2" + "@validatem/any-property" "^0.1.0" + "@validatem/error" "^1.0.0" + "@validatem/match-validation-error" "^0.1.0" + "@validatem/match-versioned-special" "^0.1.0" + "@validatem/match-virtual-property" "^0.1.0" + "@validatem/normalize-rules" "^0.1.0" + "@validatem/required" "^0.1.0" + "@validatem/validation-result" "^0.1.1" + "@validatem/virtual-property" "^0.1.0" + 
as-expression "^1.0.0" + assure-array "^1.0.0" + create-error "^0.3.1" + default-value "^1.0.0" + execall "^2.0.0" + flatten "^1.0.3" + indent-string "^4.0.0" + is-arguments "^1.0.4" + supports-color "^7.1.0" + syncpipe "^1.0.0" + "@validatem/default-to@^0.1.0": version "0.1.0" resolved "https://registry.yarnpkg.com/@validatem/default-to/-/default-to-0.1.0.tgz#62766a3ca24d2f61a96c713bcb629a5b3c6427c5" @@ -697,7 +723,7 @@ default-value "^1.0.0" flatten "^1.0.3" -"@validatem/is-array@^0.1.0": +"@validatem/is-array@^0.1.0", "@validatem/is-array@^0.1.1": version "0.1.1" resolved "https://registry.yarnpkg.com/@validatem/is-array/-/is-array-0.1.1.tgz#fbe15ca8c97c30b622a5bbeb536d341e99cfc2c5" integrity sha512-XD3C+Nqfpnbb4oO//Ufodzvui7SsCIW/stxZ39dP/fyRsBHrdERinkFATH5HepegtDlWMQswm5m1XFRbQiP2oQ== @@ -853,6 +879,15 @@ default-value "^1.0.0" split-filter-n "^1.1.2" +"@validatem/wrap-path@^0.1.0": + version "0.1.0" + resolved "https://registry.yarnpkg.com/@validatem/wrap-path/-/wrap-path-0.1.0.tgz#777998b62d3e74f2b2897c992dae9b3675161c33" + integrity sha512-6hOqydnr4u8FA0iRv8fyXxsr64T99+w/XL/fixmsgN0uqulEIwGMxCre3y9YkFNcEtysyPHkQl0CrGPcASsZxw== + dependencies: + "@validatem/annotate-errors" "^0.1.2" + "@validatem/combinator" "^0.1.2" + "@validatem/validation-result" "^0.1.2" + "@validatem/wrap-value-as-option@^0.1.0": version "0.1.0" resolved "https://registry.yarnpkg.com/@validatem/wrap-value-as-option/-/wrap-value-as-option-0.1.0.tgz#57fa8d535f6cdf40cf8c8846ad45f4dd68f44568" @@ -1969,6 +2004,23 @@ match-value@^1.1.0: resolved "https://registry.yarnpkg.com/match-value/-/match-value-1.1.0.tgz#ad311ef8bbe2d344a53ec3104e28fe221984b98e" integrity sha512-NOvpobcmkX+l9Eb6r2s3BkR1g1ZwzExDFdXA9d6p1r1O1olLbo88KuzMiBmg43xSpodfm7I6Hqlx2OoySquEgg== +merge-by-template@^0.1.4: + version "0.1.4" + resolved "https://registry.yarnpkg.com/merge-by-template/-/merge-by-template-0.1.4.tgz#8a03e6383a4e2f2e4a6460bff0d6d3e7b468a535" + integrity 
sha512-10h5HyGLJJu1F1z02oMqpvMa6oraLr7Vp0gPxlw6Od8xlvzTFr0TQGPZXMLBmZlhZRY910AXGJ6AFc2iXGZ7uQ== + dependencies: + "@validatem/core" "^0.3.3" + "@validatem/default-to" "^0.1.0" + "@validatem/is-array" "^0.1.1" + "@validatem/is-boolean" "^0.1.1" + "@validatem/is-plain-object" "^0.1.1" + "@validatem/remove-nullish-items" "^0.1.0" + "@validatem/virtual-property" "^0.1.0" + "@validatem/wrap-path" "^0.1.0" + default-value "^1.0.0" + fromentries "^1.2.0" + range "^0.0.3" + mime@^1.3.4: version "1.6.0" resolved "https://registry.yarnpkg.com/mime/-/mime-1.6.0.tgz#32cd9e5c64553bd58d19a568af452acff04981b1" @@ -2208,6 +2260,11 @@ randexp@0.4.6: discontinuous-range "1.0.0" ret "~0.1.10" +range@^0.0.3: + version "0.0.3" + resolved "https://registry.yarnpkg.com/range/-/range-0.0.3.tgz#b5b8eb2463a516b624a563bd32b18fe89e70151b" + integrity sha512-OxK2nY2bmeEB4NxoBraQIBOOeOIxoBvm6yt8MA1kLappgkG3SyLf173iOtT5woWycrtESDD2g0Nl2yt8YPoUnw== + readable-stream@^2.2.2: version "2.3.7" resolved "https://registry.yarnpkg.com/readable-stream/-/readable-stream-2.3.7.tgz#1eca1cf711aef814c04f62252a36a62f6cb23b57"