From 9e680d0882750a3b4689327b100830f9cea9781f Mon Sep 17 00:00:00 2001 From: supakeen Date: Sat, 5 Mar 2022 12:56:31 +0100 Subject: [PATCH 01/10] Simplify layout and templates. By removing wrapper elements and switching to a CSS grid layout there are overall less elements to style while providing the same possibilities. --- public/css/style.css | 109 +++++++++++++++++++++------------------- src/css/style.css | 113 ++++++++++++++++++++++-------------------- src/views/_layout.jsx | 48 ++++++++---------- 3 files changed, 139 insertions(+), 131 deletions(-) diff --git a/public/css/style.css b/public/css/style.css index da13113..2a1ba2e 100644 --- a/public/css/style.css +++ b/public/css/style.css @@ -1,76 +1,81 @@ +/*$background: #f9f9f9ff;*/ + +* { + box-sizing: border-box; +} + html, body { /* background-color: rgb(248, 249, 236); */ background-color: #f9f9f9ff; font-family: sans-serif; - margin: 0; padding: 0; + min-height: 100%; } -.wrapper { - max-width: 900px; - margin-left: 1em; - margin-right: 1em; +html { + margin: 0; + height: 100%; } -.header { - margin-top: 1em; - margin-left: 0.5em; - margin-right: 0.5em; - padding-bottom: .7em; +body { + height: 100%; + margin: 0 auto; + display: grid; + grid-template: + "header" 7rem + "content" 1fr + "footer" 50px; +} + +header { + grid-area: header; border-bottom: 2px solid black; + align-self: center; } -.contents { +header .logoContainer { + display: inline-block; + position: relative; + } + +header .logoContainer .logo { + height: 5rem; + } + +header .logoContainer .siteTag { + position: absolute; + right: 0; + bottom: 0; + + color: teal; + font-size: 1.3em; + } + +header .logoContainer .betaTag { + width: 1px; /* Out-of-box alignment hack */ + position: absolute; + right: -.3em; + bottom: 0; + font-style: italic; + + color: rgb(218, 13, 13); + font-size: 1.3em; + } + +main { + grid-area: content; margin-bottom: 2em; - padding: .7em 0; } -.footer { - position: fixed; - bottom: 0; - left: 0.5em; - right: 0.5em; +main div.search input { width: 100%; } - box-sizing: border-box; +footer { + grid-area: footer; border-top: 1px solid black; height: 2em; - padding: 0.2em; background-color: #f9f9f9ff; } -.footer .wrapper { - margin-left: calc(1em - 0.5em); - } - -.logoContainer { - display: inline-block; - position: relative; -} - -.logoContainer .logo { - height: 5em; - } - -.logoContainer .siteTag { - position: absolute; - right: 0; - bottom: 0; - - color: teal; - font-size: 1.3em; - } - -.logoContainer .betaTag { - width: 1px; /* Out-of-box alignment hack */ - position: absolute; - right: -.3em; - bottom: 0; - font-style: italic; - - color: rgb(218, 13, 13); - font-size: 1.3em; - } - .counter { margin-bottom: .5em; font-style: italic; diff --git a/src/css/style.css b/src/css/style.css index 6ef97ab..be08768 100644 --- a/src/css/style.css +++ b/src/css/style.css @@ -1,77 +1,84 @@ $pagePadding: 0.5em; +/*$background: #f9f9f9ff;*/ $background: #f9f9f9ff; +* { + box-sizing: border-box; +} + html, body { /* background-color: rgb(248, 249, 236); */ background-color: $background; font-family: sans-serif; - margin: 0; padding: 0; + min-height: 100%; } -.wrapper { - max-width: 900px; - margin-left: 1em; - margin-right: 1em; +html { + margin: 0; + height: 100%; } -.header { - margin-top: 1em; - margin-left: $pagePadding; - margin-right: $pagePadding; - padding-bottom: .7em; +body { + height: 100%; + margin: 0 auto; + display: grid; + grid-template: + "header" 7rem + "content" 1fr + "footer" 50px; +} + +header { + grid-area: header; border-bottom: 2px solid 
black; + align-self: center; + + .logoContainer { + display: inline-block; + position: relative; + + .logo { + height: 5rem; + } + + .siteTag { + position: absolute; + right: 0; + bottom: 0; + + color: teal; + font-size: 1.3em; + } + + .betaTag { + width: 1px; /* Out-of-box alignment hack */ + position: absolute; + right: -.3em; + bottom: 0; + font-style: italic; + + color: rgb(218, 13, 13); + font-size: 1.3em; + } + } } -.contents { +main { + grid-area: content; margin-bottom: 2em; - padding: .7em 0; -} -.footer { - position: fixed; - bottom: 0; - left: $pagePadding; - right: $pagePadding; - box-sizing: border-box; + div.search { + input { width: 100%; } + } +} + +footer { + grid-area: footer; border-top: 1px solid black; height: 2em; - padding: 0.2em; background-color: $background; - - .wrapper { - margin-left: calc(1em - $pagePadding); - } -} - -.logoContainer { - display: inline-block; - position: relative; - - .logo { - height: 5em; - } - - .siteTag { - position: absolute; - right: 0; - bottom: 0; - - color: teal; - font-size: 1.3em; - } - - .betaTag { - width: 1px; /* Out-of-box alignment hack */ - position: absolute; - right: -.3em; - bottom: 0; - font-style: italic; - - color: rgb(218, 13, 13); - font-size: 1.3em; - } } .counter { diff --git a/src/views/_layout.jsx b/src/views/_layout.jsx index ff6b8bf..5d63390 100644 --- a/src/views/_layout.jsx +++ b/src/views/_layout.jsx @@ -13,33 +13,29 @@ module.exports = function Layout({ children }) { -
-
- - seekseek logo - datasheets - beta - -
-
-
+
+ + seekseek logo + datasheets + beta + +
+
{children} -
-
-
- - Come chat with us! - - - - Technology - - - - Contact/Abuse - -
-
+ + -- 2.40.1 From 0c6ba5e1e4813b01611c8fcf117b1a2ff141b454 Mon Sep 17 00:00:00 2001 From: supakeen Date: Sat, 5 Mar 2022 12:58:37 +0100 Subject: [PATCH 02/10] Add media queries for larger screens. It is hard for people to keep track of very wide pages (and sentences), with these mediaqueries the width of the site is limited on screens that are wider than a 1000, or 1400 pixels. --- public/css/style.css | 20 ++++++++++++++++++++ src/css/style.css | 20 ++++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/public/css/style.css b/public/css/style.css index 2a1ba2e..04d482d 100644 --- a/public/css/style.css +++ b/public/css/style.css @@ -91,3 +91,23 @@ footer { .linkSpacer { margin: 0 .5em; } + +@media only screen and (max-width: 1000px) { + body { + width: 100%; + padding: 0 1rem; + } +} + +@media only screen and (min-width: 1000px) { + body { + width: 80%; + } +} + +@media only screen and (min-width: 1400px) { + body { + width: 60rem; + } +} + diff --git a/src/css/style.css b/src/css/style.css index be08768..c2402d9 100644 --- a/src/css/style.css +++ b/src/css/style.css @@ -96,3 +96,23 @@ footer { .linkSpacer { margin: 0 .5em; } + +@media only screen and (max-width: 1000px) { + body { + width: 100%; + padding: 0 1rem; + } +} + +@media only screen and (min-width: 1000px) { + body { + width: 80%; + } +} + +@media only screen and (min-width: 1400px) { + body { + width: 60rem; + } +} + -- 2.40.1 From a351c5d8f14369562b95d72b138cc8c1f841d487 Mon Sep 17 00:00:00 2001 From: supakeen Date: Sat, 5 Mar 2022 13:22:25 +0100 Subject: [PATCH 03/10] Dark mode through media query. Browsers can prefer a certain color scheme. This commit implements a new dark color scheme for the website which is supposed to be a placeholder, better colors could be chosen. 
--- public/css/style.css | 31 ++++++++++++++++++++++++------- src/css/style.css | 34 ++++++++++++++++++++++++---------- 2 files changed, 48 insertions(+), 17 deletions(-) diff --git a/public/css/style.css b/public/css/style.css index 04d482d..b7a6ce3 100644 --- a/public/css/style.css +++ b/public/css/style.css @@ -1,12 +1,26 @@ -/*$background: #f9f9f9ff;*/ +:root { + --color-background: #f9f9f9; + --color-text: #000000; + --color-text-subdued: #4f4e4e; + --color-text-attention: teal; +} + +@media (prefers-color-scheme: dark) { + :root { + --color-background: #111111; + --color-text: #f9f9f9; + --color-text-subdued: #cccccc; + --color-text-attention: teal; + } +} * { box-sizing: border-box; } html, body { - /* background-color: rgb(248, 249, 236); */ - background-color: #f9f9f9ff; + color: var(--color-text); + background: var(--color-background); font-family: sans-serif; padding: 0; min-height: 100%; @@ -47,7 +61,7 @@ header .logoContainer .siteTag { right: 0; bottom: 0; - color: teal; + color: var(--color-text-attention); font-size: 1.3em; } @@ -73,7 +87,10 @@ footer { grid-area: footer; border-top: 1px solid black; height: 2em; - background-color: #f9f9f9ff; +} + +main a, footer a { + color: var(--color-text-attention); } .counter { @@ -84,8 +101,8 @@ footer { } .staticContent { - margin: 0 2em; - max-width: 900px; + margin: 0 2em; + max-width: 900px; } .linkSpacer { diff --git a/src/css/style.css b/src/css/style.css index c2402d9..9d1ff86 100644 --- a/src/css/style.css +++ b/src/css/style.css @@ -1,14 +1,26 @@ -$pagePadding: 0.5em; -/*$background: #f9f9f9ff;*/ -$background: #f9f9f9ff; +:root { + --color-background: #f9f9f9; + --color-text: #000000; + --color-text-subdued: #4f4e4e; + --color-text-attention: teal; +} + +@media (prefers-color-scheme: dark) { + :root { + --color-background: #111111; + --color-text: #f9f9f9; + --color-text-subdued: #cccccc; + --color-text-attention: teal; + } +} * { box-sizing: border-box; } html, body { - /* background-color: rgb(248, 249, 236); */ - background-color: $background; + color: var(--color-text); + background: var(--color-background); font-family: sans-serif; padding: 0; min-height: 100%; @@ -47,7 +59,7 @@ header { right: 0; bottom: 0; - color: teal; + color: var(--color-text-attention); font-size: 1.3em; } @@ -68,7 +80,6 @@ main { grid-area: content; margin-bottom: 2em; - div.search { input { width: 100%; } } @@ -78,7 +89,10 @@ footer { grid-area: footer; border-top: 1px solid black; height: 2em; - background-color: $background; +} + +main a, footer a { + color: var(--color-text-attention); } .counter { @@ -89,8 +103,8 @@ footer { } .staticContent { - margin: 0 2em; - max-width: 900px; + margin: 0 2em; + max-width: 900px; } .linkSpacer { -- 2.40.1 From 925778b060cad3e1c48b42a2566ed1dd6ef9095c Mon Sep 17 00:00:00 2001 From: supakeen Date: Sat, 5 Mar 2022 13:27:54 +0100 Subject: [PATCH 04/10] Text line-height and link hovers, semantics. The general 'good readability' rule is to have a bit larger line-height, aside from this I've added some interactivity to links by hovering over them. I also replaced `staticContent` with `article` for semantics. 
--- public/css/style.css | 13 +- src/css/style.css | 17 ++- src/views/contact.jsx | 4 +- src/views/technology.jsx | 313 +++++++++++++++++++-------------------- 4 files changed, 175 insertions(+), 172 deletions(-) diff --git a/public/css/style.css b/public/css/style.css index b7a6ce3..0a9b3f6 100644 --- a/public/css/style.css +++ b/public/css/style.css @@ -90,7 +90,13 @@ footer { } main a, footer a { - color: var(--color-text-attention); + color: var(--color-text-attention); + } + +main a:hover, footer a:hover { text-decoration: none; } + +article { + line-height: 1.25rem; } .counter { @@ -100,11 +106,6 @@ main a, footer a { text-align: right; } -.staticContent { - margin: 0 2em; - max-width: 900px; -} - .linkSpacer { margin: 0 .5em; } diff --git a/src/css/style.css b/src/css/style.css index 9d1ff86..ac58ff6 100644 --- a/src/css/style.css +++ b/src/css/style.css @@ -91,8 +91,16 @@ footer { height: 2em; } -main a, footer a { - color: var(--color-text-attention); +main, footer { + a { + color: var(--color-text-attention); + + &:hover { text-decoration: none; } + } +} + +article { + line-height: 1.25rem; } .counter { @@ -102,11 +110,6 @@ main a, footer a { text-align: right; } -.staticContent { - margin: 0 2em; - max-width: 900px; -} - .linkSpacer { margin: 0 .5em; } diff --git a/src/views/contact.jsx b/src/views/contact.jsx index 89d04db..359f9c4 100644 --- a/src/views/contact.jsx +++ b/src/views/contact.jsx @@ -7,7 +7,7 @@ const Layout = require("./_layout"); module.exports = function Contact() { return ( -
+

Do you have questions about SeekSeek, or do you want to contribute to the project?

Please join us in our Matrix room!

We actively watch the channel, and we're always happy to answer any questions you might have. If you have a criticism, we encourage you to share that too! The only requirement is that you do so constructively and respectfully. SeekSeek is a community project, and your feedback is crucial to its success!

@@ -27,7 +27,7 @@ module.exports = function Contact() {

If you would like to request a copyright takedown: If you are the copyright holder of datasheet(s) listed in our search, you can of course request that we remove these datasheets from the results, and we will do so if your report is valid.

However, we want to ask that you talk with us about it first - we are very open to addressing any practical concerns you may have, and it's not good for anybody to just remove them entirely. Your customers will find it more difficult to find documentation on your products, and that will do no good for your business either!

-
+
); }; diff --git a/src/views/technology.jsx b/src/views/technology.jsx index 3314843..e91ce13 100644 --- a/src/views/technology.jsx +++ b/src/views/technology.jsx @@ -7,164 +7,163 @@ const Layout = require("./_layout"); module.exports = function Technology() { return ( -
-

The technology

+
+

The technology

-

So... what makes SeekSeek tick? Let's get the boring bits out of the way first:

-
    -
  • The whole thing is written in Javascript, end-to-end, including the scraper.
  • -
  • Both the scraping server and the search frontend server run on NixOS.
  • -
  • PostgreSQL is used as the database, both for the scraper and the search frontends (there's only one - frontend at the time of writing).
  • -
  • The search frontends use React for rendering the UI; server-side where possible, browser-side where - necessary.
  • -
  • Server-side rendering is done with a fork of express-react-views.
  • -
  • Most scraping tasks use bhttp as the HTTP client, and cheerio (a 'headless' implementation - of the jQuery API) for data extraction.
  • -
-

None of that is really very interesting, but people always ask about it. Let's move on to the interesting - bits!

-

The goal

-

Before we can talk about the technology, we need to talk about what the technology was built for. - SeekSeek is radical software. From the ground up, it was - designed to be FOSS, collaborative and community-driven, non-commercial, ad-free, and to improve the world - in - the case of SeekSeek specifically, to improve on the poor state of keyword-only searches by providing highly - specialized search engines instead!

-

But... that introduces some unusual requirements:

-
    -
  • It needs to be resource-conservative: While it doesn't need to be perfectly optimized, it shouldn't require absurd amounts of RAM or CPU power either. It should be possible to run - the whole thing on a desktop or a cheap server - the usual refrain of "extra servers are - cheaper than extra developers", a very popular one in startups, does not apply here.
  • -
  • It needs to be easy to spin up for development: The entire codebase needs to be - self-contained as much as reasonably possible, requiring not much more than an npm install to - get everything in place. No weirdly complex build stacks, no assumptions about how the developer's - system is laid out, and things need to be debuggable by someone who has never touched it before. It needs to - be possible for anybody to hack on it, not just a bunch of core developers.
  • -
  • It needs to be easy to deploy and maintain: It needs to work with commodity software on - standard operating systems, including in constrained environments like containers and VPSes. No weird kernel - settings, no complex network setup requirements. It needs to Just Work, and to keep working with - very little maintenance. Upgrades need to be seamless.
  • -
  • It needs to be flexible: Time is still a valuable resource in a collaborative project - - unlike a company, we can't assume that someone will be able to spend a working day restructuring the - entire codebase. Likewise, fundamental restructuring causes coordination issues across the community, - because a FOSS community is not a centralized entity with a manager who decides what happens. That means - that the core (extensible) architecture needs to be right from the start, and able to adapt to - changing circumstances, more so because scraping is involved.
  • -
  • It needs to be accessible: It should be possible for any developer to build and - contribute to scrapers; not just specialized developers who have spent half their life working on this sort - of thing. That means that the API needs to be simple, and there needs to be space for someone to use the - tools they are comfortable with.
  • -
-

At the time of writing, there's only a datasheet search engine. However, the long-term goal is for SeekSeek - to become a large collection of specialized search engines - each one with a tailor-made UI that's - ideal for the thing being searched through. So all of the above needs to be satisfied not just for a datasheet - search engine, but for a potentially unlimited series of search engines, many of which are not even on - the roadmap yet!

-

And well, the very short version is that none of the existing options that I've evaluated even came - close to meeting these requirements. Existing scraping stacks, job queues, and so on tend to very much - be designed for corporate environments with tight control over who works on what. That wasn't an option - here. So let's talk about what we ended up with instead!

-

The scraping server

-

The core component in SeekSeek is the 'scraping server' - an experimental project called srap that was built specifically for SeekSeek; though also - designed to be more generically useful. You can think of srap as a persistent job queue that's - optimized for scraping.

-

So what does that mean? The basic idea behind srap is that you have a big pile of "items" - each item - isn't much more than a unique identifier and some 'initial data' to represent the work to be done. - Each item can have zero or more 'tags' assigned, which are just short strings. Crucially, none of these - items do anything yet - they're really just a mapping from an identifier to some arbitrarily-shaped - JSON.

-

The real work starts with the scraper configuration. Even though it's called a - 'configuration', it's really more of a codebase - you can find the configuration that - SeekSeek uses here. You'll notice that it defines a number of tasks - and seed items. The seed items are simply inserted automatically if they don't exist yet, and define - the 'starting point' for the scraper.

-

The tasks, however, define what the scraper does. Every task represents one specific operation in the - scraping process; typically, there will be multiple tasks per source. One to find product categories, one to - extract products from a category listing, one to extract data from a product page, and so on. Each of these - tasks has its own concurrency settings, as well as a TTL (Time-To-Live) that defines after how long the scraper - should revisit it.

-

Finally, what wires it all together are the tag mappings. These define what tasks should be executed for - what tags - or more accurately, for all the items that are tagged with those tags. Tags associated with - items are dynamic, they can be added or removed by any scraping task. This provides a huge amount of - flexibility, because any task can essentially queue any other task, just by giving an item the right - tag. The scraping server then makes sure that it lands at the right spot in the queue at the right time - the - task itself doesn't need to care about any of that.

-

Here's a practical example, from the datasheet search tasks:

-
    -
  • The initial seed item for LCSC is tagged as lcsc:home.
  • -
  • The lcsc:home tag is defined to trigger the lcsc:findCategories task.
  • -
  • The lcsc:findCategories task fetches a list of categories from the source, and creates an item - tagged as lcsc:category for each.
  • -
  • The lcsc:category tag is then defined to trigger the lcsc:scrapeCategory task. -
  • -
  • The lcsc:scrapeCategory task (more or less) fetches all the products for a given category, and - creates items tagged as lcsc:product. Importantly, because the LCSC category listings - already include the product data we need, these items are immediately created with their full data - - there's no separate 'scrape product page' task!
  • -
  • The lcsc:product tag is then defined to trigger the lcsc:normalizeProduct task. -
  • -
  • The lcsc:normalizeProduct task then converts the scraped data to a standardized representation, - which is stored with a result:datasheet tag. The scraping flows for other data sources - also produce result:datasheet items - these are the items that ultimately end up in - the search frontend!
  • -
-

One thing that's not mentioned above is that lcsc:scrapeCategory doesn't actually - scrape all of the items for a category - it just scrapes a specific page of them! The initial - lcsc:findCategories task would have created as many of such 'page tasks' as there are pages - to scrape, based on the amount of items a category is said to have.

-

More interesting, though, is that the scraping flow doesn't have to be this unidirectional - if the - total amount of pages could only be learned from scraping the first page, it would have been entirely possible - for the lcsc:scrapeCategory task to create additional lcsc:category items! - The tag-based system makes recursive discovery like this a breeze, and because everything is keyed by a unique - identifier and persistent, loops are automatically prevented.

-

You'll probably have noticed that none of the above mentions HTTP requests. That's because srap - doesn't care - it has no idea what HTTP even is! All of the actual scraping logic is completely - defined by the configuration - and that's what makes it a codebase. This is the scraping logic for extracting products from an LCSC category, for example. This is also why each page is - its own item; that allows srap to rate-limit requests despite having absolutely no hooks into the HTTP library - being used, by virtue of limiting each task to 1 HTTP request.

-

There are more features in srap, like deliberately invalidating past scraping results, item merges, and 'out - of band' task result storage, but these are the basic concepts that make the whole thing work. As you can - see, it's highly flexible, unopinionated, and easy to collaboratively maintain a scraper configuration for - - every task functions more or less independently.

-

The datasheet search frontend

-

If you've used the datasheet search, you've probably - noticed that it's really fast, it almost feels like it's all local. But no, your search queries - really are going to a server. So how can it be that fast?

-

It turns out to be surprisingly simple: by default, the search is a prefix search only. That means that - it will only search for items that start with the query you entered. This is usually what you want when you - search for part numbers, and it also has some very interesting performance implications - because a - prefix search can be done entirely on an index!

-

There's actually very little magic here - the PostgreSQL database that runs behind the frontend simply has a - (normalized) index on the column for the part number, and the server is doing a LIKE - 'yourquery%' query against it. That's it! This generally yields a search result in under - 2 milliseconds, ie. nearly instantly. All it has to do is an index lookup, and those are fast.

-

On the browser side, things aren't much more complicated. Every time the query changes, it makes a new search - request to the server, cancelling the old one if one was still in progress. When it gets results, it renders - them on the screen. That's it. There are no trackers on the site, no weird custom input boxes, nothing else - to slow it down. The result is a search that feels local :)

-

The source code

-

Right now, the source code for all of these things lives across three repositories:

- -

At the time of writing, documentation is still pretty lacking across these repositories, and the code in the srap - and UI repositories in particular is pretty rough! This will be improved upon quite soon, as SeekSeek becomes - more polished.

- -

Final words

-

Of course, there are many more details that I haven't covered in this post, but hopefully this gives you an idea of how SeekSeek is put together, and why!

-

Has this post made you interested in working on SeekSeek, or maybe your own custom srap-based project? Drop - by in the chat! We'd be happy to give you pointers :)

- -
+

So... what makes SeekSeek tick? Let's get the boring bits out of the way first:

+
    +
  • The whole thing is written in Javascript, end-to-end, including the scraper.
  • +
  • Both the scraping server and the search frontend server run on NixOS.
  • +
  • PostgreSQL is used as the database, both for the scraper and the search frontends (there's only one + frontend at the time of writing).
  • +
  • The search frontends use React for rendering the UI; server-side where possible, browser-side where + necessary.
  • +
  • Server-side rendering is done with a fork of express-react-views.
  • +
  • Most scraping tasks use bhttp as the HTTP client, and cheerio (a 'headless' implementation + of the jQuery API) for data extraction.
  • +
+

None of that is really very interesting, but people always ask about it. Let's move on to the interesting + bits!

+

The goal

+

Before we can talk about the technology, we need to talk about what the technology was built for. + SeekSeek is radical software. From the ground up, it was + designed to be FOSS, collaborative and community-driven, non-commercial, ad-free, and to improve the world - in + the case of SeekSeek specifically, to improve on the poor state of keyword-only searches by providing highly + specialized search engines instead!

+

But... that introduces some unusual requirements:

+
    +
  • It needs to be resource-conservative: While it doesn't need to be perfectly optimized, it shouldn't require absurd amounts of RAM or CPU power either. It should be possible to run + the whole thing on a desktop or a cheap server - the usual refrain of "extra servers are + cheaper than extra developers", a very popular one in startups, does not apply here.
  • +
  • It needs to be easy to spin up for development: The entire codebase needs to be + self-contained as much as reasonably possible, requiring not much more than an npm install to + get everything in place. No weirdly complex build stacks, no assumptions about how the developer's + system is laid out, and things need to be debuggable by someone who has never touched it before. It needs to + be possible for anybody to hack on it, not just a bunch of core developers.
  • +
  • It needs to be easy to deploy and maintain: It needs to work with commodity software on + standard operating systems, including in constrained environments like containers and VPSes. No weird kernel + settings, no complex network setup requirements. It needs to Just Work, and to keep working with + very little maintenance. Upgrades need to be seamless.
  • +
  • It needs to be flexible: Time is still a valuable resource in a collaborative project - + unlike a company, we can't assume that someone will be able to spend a working day restructuring the + entire codebase. Likewise, fundamental restructuring causes coordination issues across the community, + because a FOSS community is not a centralized entity with a manager who decides what happens. That means + that the core (extensible) architecture needs to be right from the start, and able to adapt to + changing circumstances, more so because scraping is involved.
  • +
  • It needs to be accessible: It should be possible for any developer to build and + contribute to scrapers; not just specialized developers who have spent half their life working on this sort + of thing. That means that the API needs to be simple, and there needs to be space for someone to use the + tools they are comfortable with.
  • +
+

At the time of writing, there's only a datasheet search engine. However, the long-term goal is for SeekSeek + to become a large collection of specialized search engines - each one with a tailor-made UI that's + ideal for the thing being searched through. So all of the above needs to be satisfied not just for a datasheet + search engine, but for a potentially unlimited series of search engines, many of which are not even on + the roadmap yet!

+

And well, the very short version is that none of the existing options that I've evaluated even came + close to meeting these requirements. Existing scraping stacks, job queues, and so on tend to very much + be designed for corporate environments with tight control over who works on what. That wasn't an option + here. So let's talk about what we ended up with instead!

+

The scraping server

+

The core component in SeekSeek is the 'scraping server' - an experimental project called srap that was built specifically for SeekSeek; though also + designed to be more generically useful. You can think of srap as a persistent job queue that's + optimized for scraping.

+

So what does that mean? The basic idea behind srap is that you have a big pile of "items" - each item + isn't much more than a unique identifier and some 'initial data' to represent the work to be done. + Each item can have zero or more 'tags' assigned, which are just short strings. Crucially, none of these + items do anything yet - they're really just a mapping from an identifier to some arbitrarily-shaped + JSON.
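To make that concrete, you can picture an item as a record roughly like the one below; the field names are invented for illustration and are not srap's actual schema.

// Hypothetical item shape -- invented names, not srap's actual schema.
const item = {
  id: "lcsc:category:resistors:page:3",   // unique identifier
  data: { categoryId: 364, page: 3 },     // arbitrary JSON 'initial data'
  tags: ["lcsc:category"]                 // zero or more short strings
};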

+

The real work starts with the scraper configuration. Even though it's called a + 'configuration', it's really more of a codebase - you can find the configuration that + SeekSeek uses here. You'll notice that it defines a number of tasks + and seed items. The seed items are simply inserted automatically if they don't exist yet, and define + the 'starting point' for the scraper.

+

The tasks, however, define what the scraper does. Every task represents one specific operation in the + scraping process; typically, there will be multiple tasks per source. One to find product categories, one to + extract products from a category listing, one to extract data from a product page, and so on. Each of these + tasks has its own concurrency settings, as well as a TTL (Time-To-Live) that defines after how long the scraper + should revisit it.
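Concretely, each task can be pictured as a small descriptor plus a function, something like the sketch below; the field names are made up and do not reflect srap's real configuration format.

// Invented descriptor shape, purely illustrative -- not srap's real format.
const scrapeCategoryTask = {
  name: "lcsc:scrapeCategory",
  concurrency: 2,               // how many instances may run at the same time
  ttl: 7 * 24 * 60 * 60 * 1000, // revisit the item after a week (expressed in milliseconds here)
  run: async (item, helpers) => {
    // fetch one category page and create lcsc:product items (see the later sketches)
  }
};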

+

Finally, what wires it all together are the tag mappings. These define what tasks should be executed for + what tags - or more accurately, for all the items that are tagged with those tags. Tags associated with + items are dynamic, they can be added or removed by any scraping task. This provides a huge amount of + flexibility, because any task can essentially queue any other task, just by giving an item the right + tag. The scraping server then makes sure that it lands at the right spot in the queue at the right time - the + task itself doesn't need to care about any of that.

+

Here's a practical example, from the datasheet search tasks:

+
    +
  • The initial seed item for LCSC is tagged as lcsc:home.
  • +
  • The lcsc:home tag is defined to trigger the lcsc:findCategories task.
  • +
  • The lcsc:findCategories task fetches a list of categories from the source, and creates an item + tagged as lcsc:category for each.
  • +
  • The lcsc:category tag is then defined to trigger the lcsc:scrapeCategory task. +
  • +
  • The lcsc:scrapeCategory task (more or less) fetches all the products for a given category, and + creates items tagged as lcsc:product. Importantly, because the LCSC category listings + already include the product data we need, these items are immediately created with their full data + - there's no separate 'scrape product page' task!
  • +
  • The lcsc:product tag is then defined to trigger the lcsc:normalizeProduct task. +
  • +
  • The lcsc:normalizeProduct task then converts the scraped data to a standardized representation, + which is stored with a result:datasheet tag. The scraping flows for other data sources + also produce result:datasheet items - these are the items that ultimately end up in + the search frontend!
  • +
+

One thing that's not mentioned above is that lcsc:scrapeCategory doesn't actually + scrape all of the items for a category - it just scrapes a specific page of them! The initial + lcsc:findCategories task would have created as many such 'page tasks' as there are pages + to scrape, based on the number of items a category is said to have.

+

More interesting, though, is that the scraping flow doesn't have to be this unidirectional - if the + total amount of pages could only be learned from scraping the first page, it would have been entirely possible + for the lcsc:scrapeCategory task to create additional lcsc:category items! + The tag-based system makes recursive discovery like this a breeze, and because everything is keyed by a unique + identifier and persistent, loops are automatically prevented.
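The mechanism itself fits in a few lines of plain JavaScript. The sketch below is a toy, in-memory model of tag mappings plus de-duplication by identifier; it is not srap's API, just the idea.

// Toy model of tag mappings and loop prevention -- not srap code.
const tagMappings = {
  "lcsc:home": ["lcsc:findCategories"],
  "lcsc:category": ["lcsc:scrapeCategory"],
  "lcsc:product": ["lcsc:normalizeProduct"]
};

const seen = new Map(); // id -> item; keying by id is what prevents loops
const queue = [];       // items whose tags still have to be matched against tasks

function createItem({ id, data = {}, tags = [] }) {
  if (seen.has(id)) return seen.get(id); // already known, so nothing new is queued
  const item = { id, data, tags };
  seen.set(id, item);
  queue.push(item);
  return item;
}

async function drain(tasks) {
  while (queue.length > 0) {
    const item = queue.shift();
    for (const tag of item.tags) {
      for (const taskName of tagMappings[tag] || []) {
        // A task queues follow-up work simply by creating (and tagging) items.
        await tasks[taskName](item, { createItem });
      }
    }
  }
}

// Stub wiring: the seed item triggers the whole flow.
createItem({ id: "lcsc:home", tags: ["lcsc:home"] });
drain({
  "lcsc:findCategories": async (_item, { createItem }) =>
    createItem({ id: "lcsc:category:1", data: { page: 1 }, tags: ["lcsc:category"] }),
  "lcsc:scrapeCategory": async () => {},
  "lcsc:normalizeProduct": async () => {}
}).then(() => console.log(`processed ${seen.size} items`));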

+

You'll probably have noticed that none of the above mentions HTTP requests. That's because srap + doesn't care - it has no idea what HTTP even is! All of the actual scraping logic is completely + defined by the configuration - and that's what makes it a codebase. This is the scraping logic for extracting products from an LCSC category, for example. This is also why each page is + its own item; that allows srap to rate-limit requests despite having absolutely no hooks into the HTTP library + being used, by virtue of limiting each task to 1 HTTP request.
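As an impression of what such task logic tends to look like (the real LCSC logic lives in the linked scraper configuration; the URL field, the selectors, and the createItem helper below are invented for illustration):

// Generic sketch of a scraping task's logic -- selectors and helpers are invented.
const bhttp = require("bhttp");
const cheerio = require("cheerio");

async function scrapeCategoryPage(item, { createItem }) {
  // A single HTTP request per task run; that is what lets the scraping server
  // rate-limit without knowing anything about HTTP itself.
  const response = await bhttp.get(item.data.url);
  const $ = cheerio.load(response.body.toString());

  $(".product-row").each((_index, element) => {
    const row = $(element);
    createItem({
      id: `lcsc:product:${row.attr("data-product-id")}`,
      data: {
        name: row.find(".product-name").text().trim(),
        datasheetUrl: row.find("a.datasheet").attr("href")
      },
      tags: ["lcsc:product"]
    });
  });
}

module.exports = scrapeCategoryPage;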

+

There are more features in srap, like deliberately invalidating past scraping results, item merges, and 'out + of band' task result storage, but these are the basic concepts that make the whole thing work. As you can + see, it's highly flexible, unopinionated, and easy to collaboratively maintain a scraper configuration for - + every task functions more or less independently.

+

The datasheet search frontend

+

If you've used the datasheet search, you've probably + noticed that it's really fast, it almost feels like it's all local. But no, your search queries + really are going to a server. So how can it be that fast?

+

It turns out to be surprisingly simple: by default, the search is a prefix search only. That means that + it will only search for items that start with the query you entered. This is usually what you want when you + search for part numbers, and it also has some very interesting performance implications - because a + prefix search can be done entirely on an index!

+

There's actually very little magic here - the PostgreSQL database that runs behind the frontend simply has a + (normalized) index on the column for the part number, and the server is doing a LIKE + 'yourquery%' query against it. That's it! This generally yields a search result in under + 2 milliseconds, ie. nearly instantly. All it has to do is an index lookup, and those are fast.
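A minimal sketch of that query, using node-postgres; the table and column names are invented and the normalization is assumed, so treat this as the shape of the idea rather than the actual SeekSeek schema.

// Prefix search sketch -- invented schema, not the real SeekSeek database.
// For LIKE 'prefix%' to be served from a btree index, the index needs
// text_pattern_ops (or a C collation), e.g.:
//   CREATE INDEX datasheets_normalized_part_idx
//     ON datasheets (normalized_part text_pattern_ops);
const { Pool } = require("pg");
const pool = new Pool(); // connection settings come from the usual PG* environment variables

async function searchByPrefix(query, limit = 20) {
  // Normalize the same way the indexed column was normalized (assumed to be
  // lowercasing here), and escape LIKE wildcards so user input matches literally.
  const prefix = query.toLowerCase().replace(/[\\%_]/g, "\\$&") + "%";
  const result = await pool.query(
    "SELECT manufacturer, model, url FROM datasheets WHERE normalized_part LIKE $1 LIMIT $2",
    [prefix, limit]
  );
  return result.rows;
}

module.exports = searchByPrefix;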

+

On the browser side, things aren't much more complicated. Every time the query changes, it makes a new search + request to the server, cancelling the old one if one was still in progress. When it gets results, it renders + them on the screen. That's it. There are no trackers on the site, no weird custom input boxes, nothing else + to slow it down. The result is a search that feels local :)
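On the browser side that roughly comes down to the following; the /api/search path and the response shape are made up for the example.

// Cancel-the-previous-request sketch for the browser -- endpoint is invented.
let currentSearch = null;

function renderResults(results) {
  console.log(results); // stand-in for the real React rendering
}

async function onQueryChange(query) {
  if (currentSearch) currentSearch.abort(); // drop the request that is still in flight
  const controller = new AbortController();
  currentSearch = controller;

  try {
    const response = await fetch(`/api/search?q=${encodeURIComponent(query)}`, {
      signal: controller.signal
    });
    renderResults(await response.json());
  } catch (error) {
    if (error.name !== "AbortError") throw error; // aborted requests are expected
  }
}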

+

The source code

+

Right now, the source code for all of these things lives across three repositories:

+ +

At the time of writing, documentation is still pretty lacking across these repositories, and the code in the srap + and UI repositories in particular is pretty rough! This will be improved upon quite soon, as SeekSeek becomes + more polished.

+ +

Final words

+

Of course, there are many more details that I haven't covered in this post, but hopefully this gives you an idea of how SeekSeek is put together, and why!

+

Has this post made you interested in working on SeekSeek, or maybe your own custom srap-based project? Drop + by in the chat! We'd be happy to give you pointers :)

+
); }; -- 2.40.1 From 0e97559d27ac126401735ffc35e68f7e7bb167ac Mon Sep 17 00:00:00 2001 From: supakeen Date: Sat, 5 Mar 2022 13:40:44 +0100 Subject: [PATCH 05/10] Cleanup of CSS. All units expressed in `rem`, invert logo in prefers-dark, padding around the footer. --- public/css/style.css | 22 +++++++++++++--------- src/css/style.css | 22 +++++++++++++--------- 2 files changed, 26 insertions(+), 18 deletions(-) diff --git a/public/css/style.css b/public/css/style.css index 0a9b3f6..8927e5b 100644 --- a/public/css/style.css +++ b/public/css/style.css @@ -3,6 +3,8 @@ --color-text: #000000; --color-text-subdued: #4f4e4e; --color-text-attention: teal; + --color-text-warning: rgb(218, 13, 13); + --invert: 0; } @media (prefers-color-scheme: dark) { @@ -10,7 +12,8 @@ --color-background: #111111; --color-text: #f9f9f9; --color-text-subdued: #cccccc; - --color-text-attention: teal; + --color-text-warning: rgb(218, 13, 13); + --invert: 1; } } @@ -38,22 +41,23 @@ body { grid-template: "header" 7rem "content" 1fr - "footer" 50px; + "footer" 2rem; } header { grid-area: header; - border-bottom: 2px solid black; - align-self: center; + border-bottom: 1px solid var(--color-text-subdued); } header .logoContainer { display: inline-block; position: relative; + margin: 1rem 0; } header .logoContainer .logo { height: 5rem; + filter: invert(var(--invert)); } header .logoContainer .siteTag { @@ -68,12 +72,12 @@ header .logoContainer .siteTag { header .logoContainer .betaTag { width: 1px; /* Out-of-box alignment hack */ position: absolute; - right: -.3em; + right: -.3rem; bottom: 0; font-style: italic; - color: rgb(218, 13, 13); - font-size: 1.3em; + color: var(--color-text-warning); + font-size: 1.3rem; } main { @@ -85,8 +89,8 @@ main div.search input { width: 100%; } footer { grid-area: footer; - border-top: 1px solid black; - height: 2em; + line-height: 2rem; + border-top: 1px solid var(--color-text-subdued); } main a, footer a { diff --git a/src/css/style.css b/src/css/style.css index ac58ff6..779dbf3 100644 --- a/src/css/style.css +++ b/src/css/style.css @@ -3,6 +3,8 @@ --color-text: #000000; --color-text-subdued: #4f4e4e; --color-text-attention: teal; + --color-text-warning: rgb(218, 13, 13); + --invert: 0; } @media (prefers-color-scheme: dark) { @@ -10,7 +12,8 @@ --color-background: #111111; --color-text: #f9f9f9; --color-text-subdued: #cccccc; - --color-text-attention: teal; + --color-text-warning: rgb(218, 13, 13); + --invert: 1; } } @@ -38,20 +41,21 @@ body { grid-template: "header" 7rem "content" 1fr - "footer" 50px; + "footer" 2rem; } header { grid-area: header; - border-bottom: 2px solid black; - align-self: center; + border-bottom: 1px solid var(--color-text-subdued); .logoContainer { display: inline-block; position: relative; + margin: 1rem 0; .logo { height: 5rem; + filter: invert(var(--invert)); } .siteTag { @@ -66,12 +70,12 @@ header { .betaTag { width: 1px; /* Out-of-box alignment hack */ position: absolute; - right: -.3em; + right: -.3rem; bottom: 0; font-style: italic; - color: rgb(218, 13, 13); - font-size: 1.3em; + color: var(--color-text-warning); + font-size: 1.3rem; } } } @@ -87,8 +91,8 @@ main { footer { grid-area: footer; - border-top: 1px solid black; - height: 2em; + line-height: 2rem; + border-top: 1px solid var(--color-text-subdued); } main, footer { -- 2.40.1 From 363accb91ae6cf3d6c0a5598ae5d00034c2139df Mon Sep 17 00:00:00 2001 From: supakeen Date: Sat, 5 Mar 2022 13:42:43 +0100 Subject: [PATCH 06/10] Play with the background for prefers-dark. 
--- public/css/style.css | 2 +- src/css/style.css | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/public/css/style.css b/public/css/style.css index 8927e5b..e940cf8 100644 --- a/public/css/style.css +++ b/public/css/style.css @@ -9,7 +9,7 @@ @media (prefers-color-scheme: dark) { :root { - --color-background: #111111; + --color-background: #222222; --color-text: #f9f9f9; --color-text-subdued: #cccccc; --color-text-warning: rgb(218, 13, 13); diff --git a/src/css/style.css b/src/css/style.css index 779dbf3..7747f7d 100644 --- a/src/css/style.css +++ b/src/css/style.css @@ -9,7 +9,7 @@ @media (prefers-color-scheme: dark) { :root { - --color-background: #111111; + --color-background: #222222; --color-text: #f9f9f9; --color-text-subdued: #cccccc; --color-text-warning: rgb(218, 13, 13); -- 2.40.1 From 0f46a4abc9f06d95158596d894b1eb9854d79d4d Mon Sep 17 00:00:00 2001 From: supakeen Date: Sat, 5 Mar 2022 13:50:34 +0100 Subject: [PATCH 07/10] Left-align content. Aligning the content to the left allows for a more consistent experience while resizing. --- public/css/style.css | 8 +++----- src/css/style.css | 8 +++----- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/public/css/style.css b/public/css/style.css index e940cf8..0fd6feb 100644 --- a/public/css/style.css +++ b/public/css/style.css @@ -25,18 +25,17 @@ html, body { color: var(--color-text); background: var(--color-background); font-family: sans-serif; - padding: 0; min-height: 100%; + margin: 0; } html { - margin: 0; height: 100%; } body { + padding: .5rem; height: 100%; - margin: 0 auto; display: grid; grid-template: "header" 7rem @@ -117,13 +116,12 @@ article { @media only screen and (max-width: 1000px) { body { width: 100%; - padding: 0 1rem; } } @media only screen and (min-width: 1000px) { body { - width: 80%; + width: 70%; } } diff --git a/src/css/style.css b/src/css/style.css index 7747f7d..d821624 100644 --- a/src/css/style.css +++ b/src/css/style.css @@ -25,18 +25,17 @@ html, body { color: var(--color-text); background: var(--color-background); font-family: sans-serif; - padding: 0; min-height: 100%; + margin: 0; } html { - margin: 0; height: 100%; } body { + padding: .5rem; height: 100%; - margin: 0 auto; display: grid; grid-template: "header" 7rem @@ -121,13 +120,12 @@ article { @media only screen and (max-width: 1000px) { body { width: 100%; - padding: 0 1rem; } } @media only screen and (min-width: 1000px) { body { - width: 80%; + width: 70%; } } -- 2.40.1 From 3bdbde1c2f5ed55837aa6d9631a374c9390b0773 Mon Sep 17 00:00:00 2001 From: supakeen Date: Sat, 5 Mar 2022 14:00:07 +0100 Subject: [PATCH 08/10] Propagate CSS variables. CSS variables are used for the prefers-dark, propagate them to component CSS as well. 
--- public/css/style.css | 6 +++- src/css/style.css | 6 +++- src/frontend/components/datasheet-search.css | 37 ++++++++++---------- 3 files changed, 28 insertions(+), 21 deletions(-) diff --git a/public/css/style.css b/public/css/style.css index 0fd6feb..ddf1085 100644 --- a/public/css/style.css +++ b/public/css/style.css @@ -1,6 +1,8 @@ :root { --color-background: #f9f9f9; + --color-background-attention: #ffffff; --color-text: #000000; + --color-text-inverted: #ffffff; --color-text-subdued: #4f4e4e; --color-text-attention: teal; --color-text-warning: rgb(218, 13, 13); @@ -10,8 +12,10 @@ @media (prefers-color-scheme: dark) { :root { --color-background: #222222; + --color-background-attention: #000000; --color-text: #f9f9f9; - --color-text-subdued: #cccccc; + --color-text-inverted: #ffffff; + --color-text-subdued: #f1f1f1; --color-text-warning: rgb(218, 13, 13); --invert: 1; } diff --git a/src/css/style.css b/src/css/style.css index d821624..49939a4 100644 --- a/src/css/style.css +++ b/src/css/style.css @@ -1,6 +1,8 @@ :root { --color-background: #f9f9f9; + --color-background-attention: #ffffff; --color-text: #000000; + --color-text-inverted: #ffffff; --color-text-subdued: #4f4e4e; --color-text-attention: teal; --color-text-warning: rgb(218, 13, 13); @@ -10,8 +12,10 @@ @media (prefers-color-scheme: dark) { :root { --color-background: #222222; + --color-background-attention: #000000; --color-text: #f9f9f9; - --color-text-subdued: #cccccc; + --color-text-inverted: #ffffff; + --color-text-subdued: #f1f1f1; --color-text-warning: rgb(218, 13, 13); --invert: 1; } diff --git a/src/frontend/components/datasheet-search.css b/src/frontend/components/datasheet-search.css index 60320ae..a361111 100644 --- a/src/frontend/components/datasheet-search.css +++ b/src/frontend/components/datasheet-search.css @@ -1,39 +1,38 @@ .query { - font-size: 1.3em; - background-color: white; - color: black; - border: 1px solid teal; + font-size: 1.3rem; + background-color: var(--color-background-autention); + color: var(--color-text); + border: 1px solid var(--color-text-attention); border-radius: 5px; - padding: .4em .8em; + padding: .4rem .8rem; width: 100%; box-sizing: border-box; } .noResults { - padding: .6em 1em; + padding: .6rem 1rem; } .results { - border: 1px solid teal; + border: 1px solid var(--color-text-attention); border-radius: 5px; - /* margin-top: 1em; */ .result { - border-top: 1px solid teal; - padding: .6em 1em; - font-size: .8em; + border-top: 1px solid var(--color-text-attention); + padding: .6rem 1rem; + font-size: .8rem; &.first { border-top: 0; - font-size: 1em + font-size: 1rem } .name { - font-size: 1.3em; + font-size: 1.3rem; .manufacturer { - color: #4f4e4e; - margin-right: .3em; + color: var(--color-text-subdued); + margin-right: .3rem; } .model { @@ -42,7 +41,7 @@ } .description { - color: #4f4e4e; + color: var(--color-text-subdued); margin-top: .2em; } @@ -55,9 +54,9 @@ margin-top: -.3em; margin-right: -.7em; - border: 1px solid #006262; - background-color: #039f9f; - color: white; + border: 1px solid var(--color-text); + background-color: var(--color-text-attention); + color: var(--color-text-inverted); } } } -- 2.40.1 From 1f0ba04aca8cee32a03f3730ec66f87c46037b9f Mon Sep 17 00:00:00 2001 From: supakeen Date: Sat, 5 Mar 2022 14:28:51 +0100 Subject: [PATCH 09/10] Bad editor. Bad. Fix mixed indentation and removal of tabs. 
--- src/css/style.css | 150 +++++++++---------- src/views/_layout.jsx | 32 ++-- src/views/technology.jsx | 312 +++++++++++++++++++-------------------- 3 files changed, 247 insertions(+), 247 deletions(-) diff --git a/src/css/style.css b/src/css/style.css index 49939a4..cda8b18 100644 --- a/src/css/style.css +++ b/src/css/style.css @@ -1,113 +1,113 @@ :root { - --color-background: #f9f9f9; - --color-background-attention: #ffffff; - --color-text: #000000; - --color-text-inverted: #ffffff; - --color-text-subdued: #4f4e4e; - --color-text-attention: teal; - --color-text-warning: rgb(218, 13, 13); - --invert: 0; + --color-background: #f9f9f9; + --color-background-attention: #ffffff; + --color-text: #000000; + --color-text-inverted: #ffffff; + --color-text-subdued: #4f4e4e; + --color-text-attention: teal; + --color-text-warning: rgb(218, 13, 13); + --invert: 0; } @media (prefers-color-scheme: dark) { - :root { - --color-background: #222222; - --color-background-attention: #000000; - --color-text: #f9f9f9; - --color-text-inverted: #ffffff; - --color-text-subdued: #f1f1f1; - --color-text-warning: rgb(218, 13, 13); - --invert: 1; - } + :root { + --color-background: #222222; + --color-background-attention: #000000; + --color-text: #f9f9f9; + --color-text-inverted: #ffffff; + --color-text-subdued: #f1f1f1; + --color-text-warning: rgb(218, 13, 13); + --invert: 1; + } } * { - box-sizing: border-box; + box-sizing: border-box; } html, body { - color: var(--color-text); + color: var(--color-text); background: var(--color-background); font-family: sans-serif; - min-height: 100%; - margin: 0; + min-height: 100%; + margin: 0; } html { - height: 100%; + height: 100%; } body { - padding: .5rem; - height: 100%; - display: grid; - grid-template: - "header" 7rem - "content" 1fr - "footer" 2rem; + padding: .5rem; + height: 100%; + display: grid; + grid-template: + "header" 7rem + "content" 1fr + "footer" 2rem; } header { - grid-area: header; + grid-area: header; border-bottom: 1px solid var(--color-text-subdued); - .logoContainer { - display: inline-block; - position: relative; - margin: 1rem 0; + .logoContainer { + display: inline-block; + position: relative; + margin: 1rem 0; - .logo { - height: 5rem; - filter: invert(var(--invert)); - } + .logo { + height: 5rem; + filter: invert(var(--invert)); + } - .siteTag { - position: absolute; - right: 0; - bottom: 0; + .siteTag { + position: absolute; + right: 0; + bottom: 0; - color: var(--color-text-attention); - font-size: 1.3em; - } + color: var(--color-text-attention); + font-size: 1.3em; + } - .betaTag { - width: 1px; /* Out-of-box alignment hack */ - position: absolute; - right: -.3rem; - bottom: 0; - font-style: italic; + .betaTag { + width: 1px; /* Out-of-box alignment hack */ + position: absolute; + right: -.3rem; + bottom: 0; + font-style: italic; - color: var(--color-text-warning); - font-size: 1.3rem; - } - } + color: var(--color-text-warning); + font-size: 1.3rem; + } + } } main { - grid-area: content; + grid-area: content; margin-bottom: 2em; - div.search { - input { width: 100%; } - } + div.search { + input { width: 100%; } + } } footer { - grid-area: footer; - line-height: 2rem; + grid-area: footer; + line-height: 2rem; border-top: 1px solid var(--color-text-subdued); } main, footer { - a { - color: var(--color-text-attention); + a { + color: var(--color-text-attention); - &:hover { text-decoration: none; } - } + &:hover { text-decoration: none; } + } } article { - line-height: 1.25rem; + line-height: 1.25rem; } .counter { @@ -122,20 +122,20 @@ article { 
} @media only screen and (max-width: 1000px) { - body { - width: 100%; - } + body { + width: 100%; + } } @media only screen and (min-width: 1000px) { - body { - width: 70%; - } + body { + width: 70%; + } } @media only screen and (min-width: 1400px) { - body { - width: 60rem; - } + body { + width: 60rem; + } } diff --git a/src/views/_layout.jsx b/src/views/_layout.jsx index 5d63390..f48b265 100644 --- a/src/views/_layout.jsx +++ b/src/views/_layout.jsx @@ -14,27 +14,27 @@ module.exports = function Layout({ children }) {
- - seekseek logo - datasheets - beta - + + seekseek logo + datasheets + beta +
{children}
diff --git a/src/views/technology.jsx b/src/views/technology.jsx index e91ce13..d42c919 100644 --- a/src/views/technology.jsx +++ b/src/views/technology.jsx @@ -7,163 +7,163 @@ const Layout = require("./_layout"); module.exports = function Technology() { return ( -
-

The technology

+
+

The technology

-

So... what makes SeekSeek tick? Let's get the boring bits out of the way first:

-
    -
  • The whole thing is written in Javascript, end-to-end, including the scraper.
  • -
  • Both the scraping server and the search frontend server run on NixOS.
  • -
  • PostgreSQL is used as the database, both for the scraper and the search frontends (there's only one - frontend at the time of writing).
  • -
  • The search frontends use React for rendering the UI; server-side where possible, browser-side where - necessary.
  • -
  • Server-side rendering is done with a fork of express-react-views.
  • -
  • Most scraping tasks use bhttp as the HTTP client, and cheerio (a 'headless' implementation - of the jQuery API) for data extraction.
  • -
-

None of that is really very interesting, but people always ask about it. Let's move on to the interesting - bits!

-

The goal

-

Before we can talk about the technology, we need to talk about what the technology was built for. - SeekSeek is radical software. From the ground up, it was - designed to be FOSS, collaborative and community-driven, non-commercial, ad-free, and to improve the world - in - the case of SeekSeek specifically, to improve on the poor state of keyword-only searches by providing highly - specialized search engines instead!

-

But... that introduces some unusual requirements:

-
    -
  • It needs to be resource-conservative: While it doesn't need to be perfectly optimized, it shouldn't require absurd amounts of RAM or CPU power either. It should be possible to run - the whole thing on a desktop or a cheap server - the usual refrain of "extra servers are - cheaper than extra developers", a very popular one in startups, does not apply here.
  • -
  • It needs to be easy to spin up for development: The entire codebase needs to be - self-contained as much as reasonably possible, requiring not much more than an npm install to - get everything in place. No weirdly complex build stacks, no assumptions about how the developer's - system is laid out, and things need to be debuggable by someone who has never touched it before. It needs to - be possible for anybody to hack on it, not just a bunch of core developers.
  • -
  • It needs to be easy to deploy and maintain: It needs to work with commodity software on - standard operating systems, including in constrained environments like containers and VPSes. No weird kernel - settings, no complex network setup requirements. It needs to Just Work, and to keep working with - very little maintenance. Upgrades need to be seamless.
  • -
So... what makes SeekSeek tick? Let's get the boring bits out of the way first:

  • The whole thing is written in JavaScript, end-to-end, including the scraper.
  • Both the scraping server and the search frontend server run on NixOS.
  • PostgreSQL is used as the database, both for the scraper and the search frontends (there's only one frontend at the time of writing).
  • The search frontends use React for rendering the UI; server-side where possible, browser-side where necessary.
  • Server-side rendering is done with a fork of express-react-views.
  • Most scraping tasks use bhttp as the HTTP client, and cheerio (a 'headless' implementation of the jQuery API) for data extraction; a minimal sketch of that combination follows this list.

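To make that last point a bit more concrete, here's a minimal sketch of what a bhttp + cheerio fetch-and-extract step can look like. The URL, selectors and field names are made up for illustration; they're not taken from the real SeekSeek scraper:

const bhttp = require("bhttp");
const cheerio = require("cheerio");

// Fetch one page and pull a few fields out of it with jQuery-style selectors.
async function scrapeExamplePage(url) {
  const response = await bhttp.get(url);
  const $ = cheerio.load(response.body.toString());

  return $("table.products tr.product").map((_index, element) => ({
    partNumber: $(element).find(".part-number").text().trim(),
    datasheetUrl: $(element).find("a.datasheet").attr("href")
  })).get();
}

// scrapeExamplePage("https://example.com/category/123").then(console.log);
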

None of that is really very interesting, but people always ask about it. Let's move on to the interesting bits!

The goal

Before we can talk about the technology, we need to talk about what the technology was built for. SeekSeek is radical software. From the ground up, it was designed to be FOSS, collaborative and community-driven, non-commercial, ad-free, and to improve the world - in the case of SeekSeek specifically, to improve on the poor state of keyword-only searches by providing highly specialized search engines instead!

But... that introduces some unusual requirements:

  • It needs to be resource-conservative: While it doesn't need to be perfectly optimized, it shouldn't require absurd amounts of RAM or CPU power either. It should be possible to run the whole thing on a desktop or a cheap server - the usual refrain of "extra servers are cheaper than extra developers", a very popular one in startups, does not apply here.
  • It needs to be easy to spin up for development: The entire codebase needs to be self-contained as much as reasonably possible, requiring not much more than an npm install to get everything in place. No weirdly complex build stacks, no assumptions about how the developer's system is laid out, and things need to be debuggable by someone who has never touched it before. It needs to be possible for anybody to hack on it, not just a bunch of core developers.
  • It needs to be easy to deploy and maintain: It needs to work with commodity software on standard operating systems, including in constrained environments like containers and VPSes. No weird kernel settings, no complex network setup requirements. It needs to Just Work, and to keep working with very little maintenance. Upgrades need to be seamless.
  • It needs to be flexible: Time is still a valuable resource in a collaborative project - unlike a company, we can't assume that someone will be able to spend a working day restructuring the entire codebase. Likewise, fundamental restructuring causes coordination issues across the community, because a FOSS community is not a centralized entity with a manager who decides what happens. That means that the core (extensible) architecture needs to be right from the start, and able to adapt to changing circumstances, more so because scraping is involved.
  • It needs to be accessible: It should be possible for any developer to build and contribute to scrapers; not just specialized developers who have spent half their life working on this sort of thing. That means that the API needs to be simple, and there needs to be space for someone to use the tools they are comfortable with.

At the time of writing, there's only a datasheet search engine. However, the long-term goal is for SeekSeek to become a large collection of specialized search engines - each one with a tailor-made UI that's ideal for the thing being searched through. So all of the above needs to be satisfied not just for a datasheet search engine, but for a potentially unlimited series of search engines, many of which are not even on the roadmap yet!

And well, the very short version is that none of the existing options that I've evaluated even came close to meeting these requirements. Existing scraping stacks, job queues, and so on tend to very much be designed for corporate environments with tight control over who works on what. That wasn't an option here. So let's talk about what we ended up with instead!

The scraping server

The core component in SeekSeek is the 'scraping server' - an experimental project called srap that was built specifically for SeekSeek, though also designed to be more generically useful. You can think of srap as a persistent job queue that's optimized for scraping.

So what does that mean? The basic idea behind srap is that you have a big pile of "items" - each item isn't much more than a unique identifier and some 'initial data' to represent the work to be done. Each item can have zero or more 'tags' assigned, which are just short strings. Crucially, none of these items do anything yet - they're really just a mapping from an identifier to some arbitrarily-shaped JSON.

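As a purely illustrative example (this is not srap's actual storage schema), an item for a single category page might look something like this:

const item = {
  id: "lcsc:category:372:page:4",     // the unique identifier
  data: { categoryID: 372, page: 4 }, // arbitrary JSON describing the work to be done
  tags: [ "lcsc:category" ]           // tags determine which tasks will run on it
};
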
The real work starts with the scraper configuration. Even though it's called a 'configuration', it's really more of a codebase - you can find the configuration that SeekSeek uses here. You'll notice that it defines a number of tasks and seed items. The seed items are simply inserted automatically if they don't exist yet, and define the 'starting point' for the scraper.

The tasks, however, define what the scraper does. Every task represents one specific operation in the scraping process; typically, there will be multiple tasks per source. One to find product categories, one to extract products from a category listing, one to extract data from a product page, and so on. Each of these tasks has its own concurrency settings, as well as a TTL (Time-To-Live) that defines how long the scraper should wait before revisiting it.

Finally, what wires it all together are the tag mappings. These define what tasks should be executed for what tags - or more accurately, for all the items that are tagged with those tags. Tags associated with items are dynamic; they can be added or removed by any scraping task. This provides a huge amount of flexibility, because any task can essentially queue any other task, just by giving an item the right tag. The scraping server then makes sure that it lands at the right spot in the queue at the right time - the task itself doesn't need to care about any of that.

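Putting those three concepts together, a scraper configuration conceptually contains something like the sketch below. To be clear, this is a paraphrased illustration and not srap's actual configuration format - the real thing is in the configuration codebase mentioned above:

module.exports = {
  seedItems: [
    // Inserted automatically if they don't exist yet
    { id: "lcsc:home", tags: [ "lcsc:home" ], data: {} }
  ],
  tagMappings: {
    // Which task(s) to run for items carrying a given tag
    "lcsc:home": [ "lcsc:findCategories" ],
    "lcsc:category": [ "lcsc:scrapeCategory" ],
    "lcsc:product": [ "lcsc:normalizeProduct" ]
  },
  tasks: {
    "lcsc:findCategories": {
      ttl: "7 days",   // how long before the scraper revisits an item
      concurrency: 1,  // at most one of these running at a time
      run: require("./tasks/lcsc/find-categories") // hypothetical path
    }
    // ... more tasks ...
  }
};
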
Here's a practical example, from the datasheet search tasks:

  • The initial seed item for LCSC is tagged as lcsc:home.
  • The lcsc:home tag is defined to trigger the lcsc:findCategories task.
  • The lcsc:findCategories task fetches a list of categories from the source, and creates an item tagged as lcsc:category for each.
  • The lcsc:category tag is then defined to trigger the lcsc:scrapeCategory task.
  • The lcsc:scrapeCategory task (more or less) fetches all the products for a given category, and creates items tagged as lcsc:product. Importantly, because the LCSC category listings already include the product data we need, these items are immediately created with their full data - there's no separate 'scrape product page' task!
  • The lcsc:product tag is then defined to trigger the lcsc:normalizeProduct task.
  • The lcsc:normalizeProduct task then converts the scraped data to a standardized representation, which is stored with a result:datasheet tag. The scraping flows for other data sources also produce result:datasheet items - these are the items that ultimately end up in the search frontend!

One thing that's not mentioned above is that lcsc:scrapeCategory doesn't actually scrape all of the items for a category - it just scrapes a specific page of them! The initial lcsc:findCategories task would have created as many such 'page tasks' as there are pages to scrape, based on the amount of items a category is said to have.

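To illustrate, a hypothetical version of that lcsc:findCategories task could look roughly like the following. The URL, the selectors, the page size and the createItem callback are all invented for this sketch - srap's real task API may differ, and the real task lives in the SeekSeek scraper configuration:

const bhttp = require("bhttp");
const cheerio = require("cheerio");

const PAGE_SIZE = 25; // assumed number of products per listing page

// Hypothetical task signature: receives the item it runs for, plus a callback
// for creating (or updating) other items in the scraping server.
async function findCategories(_item, { createItem }) {
  const response = await bhttp.get("https://example.com/categories");
  const $ = cheerio.load(response.body.toString());

  for (const element of $("ul.categories li").toArray()) {
    const categoryID = $(element).attr("data-category-id");
    const productCount = parseInt($(element).find(".count").text(), 10);
    const pageCount = Math.ceil(productCount / PAGE_SIZE);

    // One item per listing page, so that a single task run never needs
    // more than one HTTP request
    for (let page = 1; page <= pageCount; page++) {
      await createItem({
        id: `lcsc:category:${categoryID}:page:${page}`,
        tags: [ "lcsc:category" ],
        data: { categoryID: categoryID, page: page }
      });
    }
  }
}
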
More interesting, though, is that the scraping flow doesn't have to be this unidirectional - if the total amount of pages could only be learned from scraping the first page, it would have been entirely possible for the lcsc:scrapeCategory task to create additional lcsc:category items! The tag-based system makes recursive discovery like this a breeze, and because everything is keyed by a unique identifier and persistent, loops are automatically prevented.

You'll probably have noticed that none of the above mentions HTTP requests. That's because srap doesn't care - it has no idea what HTTP even is! All of the actual scraping logic is completely defined by the configuration - and that's what makes it a codebase. This is the scraping logic for extracting products from an LCSC category, for example. This is also why each page is its own item; that allows srap to rate-limit requests despite having absolutely no hooks into the HTTP library being used, by virtue of limiting each task to 1 HTTP request.

There are more features in srap, like deliberately invalidating past scraping results, item merges, and 'out of band' task result storage, but these are the basic concepts that make the whole thing work. As you can see, it's highly flexible, unopinionated, and easy to collaboratively maintain a scraper configuration for - every task functions more or less independently.

The datasheet search frontend

If you've used the datasheet search, you've probably noticed that it's really fast - it almost feels like it's all local. But no, your search queries really are going to a server. So how can it be that fast?

It turns out to be surprisingly simple: by default, the search is a prefix search only. That means that it will only search for items that start with the query you entered. This is usually what you want when you search for part numbers, and it also has some very interesting performance implications - because a prefix search can be done entirely on an index!

There's actually very little magic here - the PostgreSQL database that runs behind the frontend simply has a (normalized) index on the column for the part number, and the server is doing a LIKE 'yourquery%' query against it. That's it! This generally yields a search result in under 2 milliseconds, i.e. nearly instantly. All it has to do is an index lookup, and those are fast.

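In other words, the heavy lifting amounts to a query along these lines. The table and column names are placeholders, and the snippet uses the common node-postgres (pg) client purely for illustration - it's not necessarily what the real frontend uses:

const { Pool } = require("pg");

const pool = new Pool(); // connection settings come from the usual PG* environment variables

async function searchParts(query) {
  // Normalize the same way the indexed column is assumed to be normalized:
  // lowercase, separators stripped
  const normalized = query.toLowerCase().replace(/[^a-z0-9]/g, "");

  const result = await pool.query(
    `SELECT part_number, manufacturer, datasheet_url
     FROM parts
     WHERE part_number_normalized LIKE $1 || '%'
     ORDER BY part_number_normalized
     LIMIT 50`,
    [ normalized ]
  );

  return result.rows;
}

One caveat if you replicate this yourself: for a LIKE prefix match to actually use a btree index, the index generally needs to be created with text_pattern_ops (or the column needs a C collation).
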
On the browser side, things aren't much more complicated. Every time the query changes, it makes a new search request to the server, cancelling the old one if one was still in progress. When it gets results, it renders them on the screen. That's it. There are no trackers on the site, no weird custom input boxes, nothing else to slow it down. The result is a search that feels local :)

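That pattern is easy to reproduce with plain browser APIs; here's an illustrative version using fetch and AbortController - the endpoint path and the renderResults function are placeholders, and the real frontend may well do it differently:

let currentController = null;

async function onQueryChanged(query) {
  // Cancel the previous request if it's still in flight
  if (currentController) {
    currentController.abort();
  }
  currentController = new AbortController();

  try {
    const response = await fetch(`/api/search?q=${encodeURIComponent(query)}`, {
      signal: currentController.signal
    });
    renderResults(await response.json()); // renderResults: whatever updates the page
  } catch (error) {
    if (error.name !== "AbortError") {
      throw error; // only swallow cancellations
    }
  }
}
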
The source code

Right now, the source code for all of these things lives across three repositories:

+ +

At the time of writing, documentation is still pretty lacking across these repositories, and the code in the srap and UI repositories in particular is pretty rough! This will be improved upon quite soon, as SeekSeek becomes more polished.

+ +

Final words

Of course, there are many more details that I haven't covered in this post, but hopefully this gives you an idea of how SeekSeek is put together, and why!

Has this post made you interested in working on SeekSeek, or maybe your own custom srap-based project? Drop by in the chat! We'd be happy to give you pointers :)

); }; -- 2.40.1 From 13002057f8de89d60432d557d9927cdea677e626 Mon Sep 17 00:00:00 2001 From: supakeen Date: Sat, 5 Mar 2022 17:34:09 +0100 Subject: [PATCH 10/10] Initial Review Changes. A bigger border for the logo, some prettier margins. Remove one of the breakpoints for the screen size. Fix the footer again, but this time within the constraints of 'the body'. --- public/css/style.css | 136 ++++++++++--------- src/css/style.css | 40 ++++-- src/frontend/components/datasheet-search.css | 2 +- 3 files changed, 99 insertions(+), 79 deletions(-) diff --git a/public/css/style.css b/public/css/style.css index ddf1085..16abe19 100644 --- a/public/css/style.css +++ b/public/css/style.css @@ -1,109 +1,109 @@ :root { - --color-background: #f9f9f9; - --color-background-attention: #ffffff; - --color-text: #000000; - --color-text-inverted: #ffffff; - --color-text-subdued: #4f4e4e; - --color-text-attention: teal; - --color-text-warning: rgb(218, 13, 13); - --invert: 0; + --color-background: #f9f9f9; + --color-background-attention: #ffffff; + --color-text: #000000; + --color-text-inverted: #ffffff; + --color-text-subdued: #4f4e4e; + --color-text-attention: teal; + --color-text-warning: rgb(218, 13, 13); + --invert: 0; } @media (prefers-color-scheme: dark) { - :root { - --color-background: #222222; - --color-background-attention: #000000; - --color-text: #f9f9f9; - --color-text-inverted: #ffffff; - --color-text-subdued: #f1f1f1; - --color-text-warning: rgb(218, 13, 13); - --invert: 1; - } + :root { + --color-background: #222222; + --color-background-attention: #000000; + --color-text: #f9f9f9; + --color-text-inverted: #ffffff; + --color-text-subdued: #f1f1f1; + --color-text-warning: rgb(218, 13, 13); + --invert: 1; + } } * { - box-sizing: border-box; + box-sizing: border-box; } html, body { - color: var(--color-text); + color: var(--color-text); background: var(--color-background); font-family: sans-serif; - min-height: 100%; - margin: 0; + min-height: 100%; + margin: 0; } html { - height: 100%; + height: 100%; } body { - padding: .5rem; - height: 100%; - display: grid; - grid-template: - "header" 7rem - "content" 1fr - "footer" 2rem; + height: 100%; + display: grid; + grid-template: + "header" 6rem + "content" 1fr; } header { - grid-area: header; - border-bottom: 1px solid var(--color-text-subdued); + grid-area: header; + border-bottom: 2px solid var(--color-text-subdued); } header .logoContainer { - display: inline-block; - position: relative; - margin: 1rem 0; - } + display: inline-block; + position: relative; + } header .logoContainer .logo { - height: 5rem; - filter: invert(var(--invert)); - } + height: 5rem; + filter: invert(var(--invert)); + } header .logoContainer .siteTag { - position: absolute; - right: 0; - bottom: 0; + position: absolute; + right: 0; + bottom: 0; - color: var(--color-text-attention); - font-size: 1.3em; - } + color: var(--color-text-attention); + font-size: 1.3em; + } header .logoContainer .betaTag { - width: 1px; /* Out-of-box alignment hack */ - position: absolute; - right: -.3rem; - bottom: 0; - font-style: italic; + width: 1px; /* Out-of-box alignment hack */ + position: absolute; + right: -.3rem; + bottom: 0; + font-style: italic; - color: var(--color-text-warning); - font-size: 1.3rem; - } + color: var(--color-text-warning); + font-size: 1.3rem; + } main { - grid-area: content; - margin-bottom: 2em; + grid-area: content; + margin: .5rem 0; + padding: 0 0 2rem 0; } main div.search input { width: 100%; } footer { - grid-area: footer; - line-height: 2rem; + position: fixed; 
+ bottom: 0; + line-height: 2rem; border-top: 1px solid var(--color-text-subdued); + background: var(--color-background); } main a, footer a { - color: var(--color-text-attention); - } + color: var(--color-text-attention); + } main a:hover, footer a:hover { text-decoration: none; } article { - line-height: 1.25rem; + line-height: 1.25rem; } .counter { @@ -121,17 +121,27 @@ article { body { width: 100%; } -} -@media only screen and (min-width: 1000px) { - body { - width: 70%; + footer { + width: calc(100% - 1rem); } + + body { + padding: 1rem .5rem 0 .5rem; + } } @media only screen and (min-width: 1400px) { body { width: 60rem; } + + footer { + width: calc(60rem - 4rem); + } + + body { + padding: 1rem 2rem 0 2rem; + } } diff --git a/src/css/style.css b/src/css/style.css index cda8b18..e179b64 100644 --- a/src/css/style.css +++ b/src/css/style.css @@ -38,23 +38,20 @@ html { } body { - padding: .5rem; height: 100%; display: grid; grid-template: - "header" 7rem - "content" 1fr - "footer" 2rem; + "header" 6rem + "content" 1fr; } header { grid-area: header; - border-bottom: 1px solid var(--color-text-subdued); + border-bottom: 2px solid var(--color-text-subdued); .logoContainer { display: inline-block; position: relative; - margin: 1rem 0; .logo { height: 5rem; @@ -85,7 +82,8 @@ header { main { grid-area: content; - margin-bottom: 2em; + margin: .5rem 0; + padding: 0 0 2rem 0; div.search { input { width: 100%; } @@ -93,9 +91,11 @@ main { } footer { - grid-area: footer; + position: fixed; + bottom: 0; line-height: 2rem; border-top: 1px solid var(--color-text-subdued); + background: var(--color-background); } main, footer { @@ -122,20 +122,30 @@ article { } @media only screen and (max-width: 1000px) { - body { - width: 100%; - } -} + body { + width: 100%; + } + + footer { + width: calc(100% - 1rem); + } -@media only screen and (min-width: 1000px) { body { - width: 70%; + padding: 1rem .5rem 0 .5rem; } } @media only screen and (min-width: 1400px) { + body { + width: 60rem; + } + + footer { + width: calc(60rem - 4rem); + } + body { - width: 60rem; + padding: 1rem 2rem 0 2rem; } } diff --git a/src/frontend/components/datasheet-search.css b/src/frontend/components/datasheet-search.css index a361111..0b99273 100644 --- a/src/frontend/components/datasheet-search.css +++ b/src/frontend/components/datasheet-search.css @@ -1,6 +1,6 @@ .query { font-size: 1.3rem; - background-color: var(--color-background-autention); + background-color: var(--color-background-attention); color: var(--color-text); border: 1px solid var(--color-text-attention); border-radius: 5px; -- 2.40.1