"use strict" ;
const bhttp = require ( "bhttp" ) ;
const got = require ( "got" ) ;
const mergeSources = require ( "./lib/merge-sources" ) ;
const assureResponse = require ( "./lib/shared/assure-response" ) ;
// Shared per-crawler state: two HTTP clients that every source module reuses.
// Identify the crawler honestly, and distinguish dev traffic from production.
const crawlerUserAgent = (process.env.NODE_ENV === "production")
	? "seekseek.org crawler (seekseek.org/contact)"
	: "seekseek.org crawler, development mode (seekseek.org/contact)";

let state = {
	session: bhttp.session({
		headers: {
			"user-agent": crawlerUserAgent
		}
	}),
	// Separate got-based client for HTTP/2, until bhttp gains HTTP/2 support.
	// NOTE(review): this one deliberately(?) masquerades as Firefox instead of
	// using the honest crawler UA above — confirm this is intentional.
	// "user-agent": "seekseek.org beta crawler (contact/problems: admin@cryto.net)"
	gotSession: got.extend({
		http2: true,
		headers: {
			"user-agent": "Mozilla/5.0 (X11; Linux x86_64; rv:87.0) Gecko/20100101 Firefox/87.0"
		},
		hooks: {
			// Normalize/validate every response before it reaches calling code.
			afterResponse: [ (response) => {
				assureResponse(response);
				return response;
			} ]
		}
	})
};
// Base srap configuration that every scraper source gets merged into:
// PostgreSQL storage (via local UNIX socket) plus empty seed/tag/task
// registries for the source modules to populate.
let baseSchema = {
	backend: "postgresql",
	database: {
		host: "/run/postgresql", // local UNIX socket directory, not TCP
		database: "seekseek_documentation",
		pool: {
			max: 75 // cap on concurrent database connections
		}
	},
	seed: [],
	tags: {},
	tasks: {}
};
// NOTE: This is *not* currently a fully modular system! Identifiers (tags, task IDs, etc.) are still global to the srap instance as a whole, even though the code exists in different modules. Prefixing identifiers with the scraper they originate from, is still necessary!
module . exports = mergeSources ( baseSchema , [
require ( "./lib/sources/datasheets/lcsc" ) ( state ) ,
require ( "./lib/sources/datasheets/tme" ) ( state ) ,
require ( "./lib/sources/datasheets/st" ) ( state ) ,
require ( "./lib/sources/datasheets/focus-lcds" ) ( state ) ,
] ) ;