From cc7458e4108a8a2cfaf236e8a4d060cd09a0d268 Mon Sep 17 00:00:00 2001 From: Sven Slootweg Date: Sat, 27 Mar 2021 20:43:08 +0100 Subject: [PATCH] Fix license, add some docs --- README.md | 13 +++++++++++++ package.json | 2 +- 2 files changed, 14 insertions(+), 1 deletion(-) create mode 100644 README.md diff --git a/README.md b/README.md new file mode 100644 index 0000000..0df4f82 --- /dev/null +++ b/README.md @@ -0,0 +1,13 @@ +This repository contains documentation-related scrapers for seekseek.org. + +## Contributions + +Currently, the tools used for scraping are not documented. So while contributions are welcome, it will probably be a bit of work to figure out how to write a scraper :) This will change soon(tm). There may be [issues that need your help](https://git.cryto.net/seekseek/scrape-documentation/issues?q=&type=all&sort=&state=open&labels=18&milestone=0&assignee=0) on the issue tracker, though. + +By submitting a contribution, you agree to license it under the WTFPL/CC0 like the rest of the codebase, which effectively means making it public domain and free for anyone to use for any purpose. + +## Scraper development guidelines + +1. __Store dense information.__ Avoid storing things like raw HTML which mostly contain repetitive/template content. Storing structured data (e.g. parsed JSON) is ideal, but things like HTML snippets with high information density are okay too. +2. __Store original information.__ Don't try to parse meaning directly out of the scraped data, other than for discovering new items! Data normalization is a lossy process and should happen in a dedicated normalization task; that way we don't need to rescrape the entire source just because of a small change in the data normalization code. +3. __Store maximum information.__ There's no need to selectively pick out bits of information to store; if it's easy to extract more data than you are strictly looking for (e.g. 
the data is presented in JSON format), then please do so and just store it in the results! This allows for extracting more information from it later, when building other or new search engines. An example of this is how some scrapers store technical properties of components, even though what we're currently looking for is just datasheets. diff --git a/package.json b/package.json index b641ffc..2eb10fa 100644 --- a/package.json +++ b/package.json @@ -4,7 +4,7 @@ "main": "index.js", "repository": "git@git.cryto.net:seekseek/scrape-documentation.git", "author": "Sven Slootweg ", - "license": "MIT", + "license": "WTFPL OR CC0-1.0", "dependencies": { "bhttp": "^1.2.8", "bluebird": "^3.7.2",