Sven Slootweg 6 months ago
commit
464853bd7c
  1. 3
      .eslintrc
  2. 1
      .gitignore
  3. 24
      bin/run
  4. 31
      bin/simulate
  5. 6
      config.json
  6. 11
      knexfile.js
  7. 61
      migrations/20210301235340_init.js
  8. 295
      notes.txt
  9. 48
      package.json
  10. 9
      src/errors.js
  11. 71
      src/index.js
  12. 33
      src/initialize.js
  13. 164
      src/kernel.js
  14. 20
      src/models/alias.js
  15. 19
      src/models/failure.js
  16. 17
      src/models/index.js
  17. 45
      src/models/item.js
  18. 29
      src/models/tag.js
  19. 20
      src/models/task-in-progress.js
  20. 20
      src/models/task-result.js
  21. 57
      src/mutation-api/database.js
  22. 77
      src/mutation-api/simulation.js
  23. 73
      src/mutation-api/wrapper.js
  24. 381
      src/queries.js
  25. 47
      src/streams/process-task-safely.js
  26. 154
      src/task-stream.js
  27. 127
      test.sql
  28. 2762
      yarn.lock

3
.eslintrc

@ -0,0 +1,3 @@
{
"extends": "@joepie91/eslint-config"
}

1
.gitignore

@ -0,0 +1 @@
node_modules

24
bin/run

@ -0,0 +1,24 @@
#!/usr/bin/env node
"use strict";
// FIXME: Safe shutdown on ctrl+C
// FIXME: Somehow automatically detect whether other kernels are running on the system, and automatically clear locks when that is not the case?
const Promise = require("bluebird");
const yargs = require("yargs");
const path = require("path");
const createKernel = require("../src/kernel");
let argv = yargs.argv;
let configurationPath = argv._[0];
let absoluteConfigurationPath = path.join(process.cwd(), configurationPath);
let configuration = require(absoluteConfigurationPath);
return Promise.try(() => {
return createKernel(configuration);
}).then((kernel) => {
kernel.run();
});

31
bin/simulate

@ -0,0 +1,31 @@
#!/usr/bin/env node
"use strict";
const Promise = require("bluebird");
const yargs = require("yargs");
const path = require("path");
const createKernel = require("../src/kernel");
const chalk = require("chalk");
let argv = yargs.argv;
let [ configurationPath, task, item ] = argv._;
let absoluteConfigurationPath = path.join(process.cwd(), configurationPath);
let configuration = require(absoluteConfigurationPath);
return Promise.try(() => {
return createKernel(configuration);
}).then((kernel) => {
return Promise.try(() => {
return kernel.simulate({
task: task,
itemID: item
});
}).then(() => {
console.log(chalk.green.bold("Done!"));
}).finally(() => {
kernel.shutdown();
});
});

6
config.json

@ -0,0 +1,6 @@
{
"database": {
"socketPath": "/run/postgresql",
"database": "scrapingserver"
}
}

11
knexfile.js

@ -0,0 +1,11 @@
"use strict";
const config = require("./config.json");
module.exports = {
client: "pg",
connection: {
host: config.database.socketPath,
database: config.database.database
}
};

61
migrations/20210301235340_init.js

@ -0,0 +1,61 @@
"use strict";
// Creates the initial database schema; see notes.txt for the data-model rationale.
// NOTE: The `Promise` parameter is part of an older knex migration signature and is unused here.
module.exports.up = function(knex, Promise) {
	return knex.schema
		.createTable("items", (table) => {
			// NOTE: The id is the primary name for the item
			table.text("id").notNullable().primary();
			// Free-form item payload (merged across tasks)
			table.jsonb("data").notNullable();
			// NOTE: created_by references an alias, and it is allowed for this to be a broken/dead reference!
			table.text("created_by");
			table.timestamp("created_at").notNullable().defaultTo(knex.fn.now());
			table.timestamp("updated_at").notNullable(); // FIXME: Maybe should be nullable?
			table.timestamp("metadata_updated_at");
		})
		.createTable("aliases", (table) => {
			// Alternative identifiers for items; deleting/renaming an item cascades here
			table.text("alias").notNullable().primary();
			table.text("item_id").references("items.id").notNullable().onUpdate("CASCADE").onDelete("CASCADE");
		})
		.createTable("tags", (table) => {
			table.bigIncrements("id").primary();
			table.text("item_id").references("items.id").notNullable().onUpdate("CASCADE").onDelete("CASCADE");
			// Indexed because worklist queries select items by tag name
			table.text("name").notNullable().index();
		})
		.createTable("task_results", (table) => {
			// Composite primary key: at most one result per (task, item) pair
			table.primary([ "task", "item_id" ]);
			table.text("task").notNullable();
			table.text("item_id").references("items.id").notNullable().onUpdate("CASCADE").onDelete("CASCADE");
			table.text("task_version").notNullable();
			table.jsonb("metadata").notNullable();
			table.boolean("is_successful").notNullable();
			table.boolean("is_invalidated").notNullable().defaultTo(false);
			table.timestamp("updated_at").notNullable().defaultTo(knex.fn.now());
			// NULL means the result never expires
			table.timestamp("expires_at");
		})
		.createTable("tasks_in_progress", (table) => {
			// Lock table: a (task, item) pair present here is currently being processed
			table.primary([ "task", "item_id" ]);
			table.text("task").notNullable();
			table.text("item_id").references("items.id").notNullable().onUpdate("CASCADE").onDelete("CASCADE");
			table.timestamp("started_at").notNullable().defaultTo(knex.fn.now());
		})
		.createTable("failures", (table) => {
			table.bigIncrements("id").primary();
			table.text("task").notNullable();
			table.text("item_id").notNullable();
			table.timestamp("occurred_at").notNullable().defaultTo(knex.fn.now());
			// Composite foreign key onto task_results' (task, item_id) primary key
			table.foreign([ "task", "item_id" ])
				.references([ "task", "item_id" ])
				.inTable("task_results");
		});
}
// Reverts the initial schema. Tables are dropped in reverse dependency order,
// so foreign-key references are removed before the tables they point at.
module.exports.down = function(knex, Promise) {
	let tablesInDropOrder = [
		"failures",
		"tasks_in_progress",
		"task_results",
		"tags",
		"aliases",
		"items"
	];

	return tablesInDropOrder.reduce((schemaBuilder, tableName) => {
		return schemaBuilder.dropTable(tableName);
	}, knex.schema);
}

295
notes.txt

@ -0,0 +1,295 @@
FIXME:
- maximumFailure threshold for a task type, after which the task gets stopped until the issue is resolved, to avoid unnecessary requests when eg. the parsing after it fails
- log [updated], not just [new], and also other operations
- add simulation for addData, addMetadata
- task dependencies... CTE for tasks listing their completed-and-valid status, and then use that for both selection of to-do tasks and excluding those where dependencies have not been satisfied yet?
- each item has zero or more tags
- tags may (but do not have to) contain a value
- tags define which processing tasks must be invoked for an item
- tasks have full access to the item data, and may request its metadata for other tasks (though there is no guarantee of order of execution -- maybe there should be?)
- task results may be any of the following: (OUTDATED: see API methods below instead)
- data updates (in which case they are merged into the item, using a user-specified merging callback)
- when using a TTL, data updates are useful for data which is allowed to go stale but should eventually be updated
- revisions are NOT kept for data merges! data overrides are permanent and final.
- new items (in which case the new items are created, but only if the specified ID does not yet exist)
- external item upserts (in which case the original item is fetched, if any, and passed to a user-specified merging callback, defaulting to empty object if nonexistent)
- metadata (in which case the task result is stored separately, addressable by a combination of item ID + task name)
- when using a TTL, metadata is useful for data which should only be taken into consideration for as long as it is fresh (as the separate storage allows for identifying the specific data that has expired)
- likewise, expired task result metadata is not kept; it's overwritten with the new metadata instead
- "delete item" (mutually exclusive with metadata and data updates) -- meant to be used for ephemeral items, like pagination URLs in a category listing
- "expire task results" (addressed by ID + operation, default to own ID for "we immediately know this data is stale but we want to prioritize un-processed items first" cases? as well as for "retry because this failed" cases, which are always explicit)
- "create alias" (addressed by ID, default to own ID for "we've discovered an alias for this item")
- "merge item" (addressed by old and new ID)
- repoints any aliases
- takes user-supplied callback for merging item data and task result metadata; for any of those for which no callback is supplied, it simply picks the newest result
- when in doubt, use metadata, not data updates - data updates should typically only really be used for cases where the actual item data is collected across multiple tasks (eg. because some sites require parsing multiple pages to get all of the data) *or* potentially multiple tasks need access to it (eg. the URL to scrape)
- it may also be useful to use metadata to track the source of certain pieces of data
- in some cases it doesn't matter whether data or metadata is used, and it's up to the user which one to pick. we should document the exact tradeoffs to help along this process.
- task results can expire, either TTL or manually
- items can be manually added, eg. for seeding purposes
- each task can be rate-limited individually, either for a specific tag / set of tags, or globally
- item ID is either randomly generated or explicitly specified; the latter mode is also used for randomly generated IDs that have a prefix
- while tags determine what tasks to invoke on an object, the actual task results are associated with the object, not with the tag; because multiple tags may mark the same object for the same task. the tag-task association *only* serves to build a list of outstanding tasks
- TODO: eventually support binary data storage for items as well, associated with a given item
- TO FIGURE OUT: session re-establishing, like in scrappie, and how this fits into the model
- TO FIGURE OUT: how to deal with cases where it is belatedly realized that a new item should actually just be an alias to a previous item
- TO FIGURE OUT: how to handle tag renaming? this would be a stateful operation, that involves a change in the existing state in addition to a configuration change. need to figure out a UI for renaming a tag that *doesn't* run the risk of a split-brain situation when the process isn't restarted with a new config at exactly the same moment as the state gets changed.
- maybe a special boot mode that changes the state prior to starting the scraping processes?
- or let the user specify the rename in the configuration, and use some sort of version number to distinguish "old tag X" from "new tag X" when a tag name X gets reused?
- or maybe just keep it simple, let the user specify tag renames in the configuration, and just apply that rename to the full DB on every start of the process... then expect users to remove it from the renames again once they want to reuse a tag
- TO CONSIDER: if the TTL for a task gets changed, should existing task results be left at their old expiry time, or get updated to the new expiry time?
- TODO: allow marking a failure as 'handled'
- let the user define *how* to merge the data, by letting them specify a callback that receives the old data as an argument
- an item can be aliased, for handling items which have multiple canonical identifiers (eg. an internal SKU and the URL of a product page)
- any task which references item IDs can then also use the alias in its stead
- the item name itself is added as an alias to itself, to make queries simpler, as they can then just always do a lookup in the aliases table + it ensures that in-use item IDs cannot be created as an alias
- ... however, the item name is still treated as the primary/canonical identifier for display purposes, with aliases being listed separately, and so it should usually be the 'most canonical' identifier (eg. an SKU instead of a product page URL)
- when dealing with multiple namespaces (eg. SKU, EAN, URL), each identifier should ideally be prefixed as such, as in EAN:20060503
- because of this, we need to make sure to return all of these identifiers in the item lookup response as well, as the client may have gotten there through an alias, and so does not necessarily know the canonical identifier!
- aliases need to be mutable, to deal with eg. item merges
- aliases can only be single-level; any collapsing needs to be done at alias-mutation-time
- a task failing DOES NOT clear out the existing metadata+TTL, if any, for the item. likewise, new metadata (with a corresponding TTL) *may* have been set before the task failed, and this will be respected. failed items will instead be ignored by the worklist query, until the item is marked as un-failed.
- each task result stores the version of the task with which it was produced; changing the task version therefore automatically marks all entries for the old version as stale (and that is the only case where the version *should* be bumped)
- let the user specify for each task whether results should be applied or discarded upon task failure?
- seed items should be inserted conditionally, only if the ID does not already exist. for this reason, specifying an ID is mandatory for seed items.
- strongly recommend to the user that they give everything an explicit ID, because this is crucial for deduplicating items, and preventing duplicate operations from being run for tasks that have a TTL *and* where the items exist in the database multiple times due to randomly-generated IDs
- TO CONSIDER: for distributed scraping, what strategy to use?
- 1. execute task itself on worker node, but send back a log of desired changes, and assume that the code is the same on both sides, using the same callbacks on the central server end, in some way
- 2. track which data/metadata tasks tend to request, and if this occurs for more than a certain percentage of task executions, pre-fetch that data and send it along when sending the task data itself to the worker
- 3. something else?
- there are global rate limits, and then the user can also override those on a per-task basis, which will make the task subject to *both* the global ratelimit *and* the task-specific ratelimit
- allow the user to explicitly specify that they want the task exempted from the global ratelimit?
- also possible to specify a rate limit for a group of tasks, eventually, to eg. group together tasks that target the same website (including wildcard matching task names)
- TO CONSIDER: are tags that can contain values actually useful, when data can already be stored in the item data itself + the task metadata?
- TO FIGURE OUT: log method with log levels?
- TO IMPLEMENT: it must be possible for an item to set its created_by explicitly; for example, if item A produces ephemeral work item B which then results in item C, that item C should be able to claim to have been produced by item A, so that B can be safely deleted without breaking the reference chain
- "max outstanding failures for task" option before completely disabling the task, eg. to deal with upstream site changes
- plus a "mark all failures handled" operation for the task type, once it's been resolved
ITEMS
id (string)
data (json)
created_by -- references an item ID, *via* an alias (to handle item merges), and this is allowed to be a broken reference!
created_at
updated_at
ALIASES
id (string)
item_id
TAGS
id (num)
item_id
tag_name (string)
value (opt.)
TASK_RESULTS
id (num)
task (string, name)
task_version (string)
item_id
metadata (json)
is_successful (boolean) -- set to false when an error occurred during execution of the operation
is_invalidated (boolean)
created_at
expires_at
UNIQUE (task, item_id)
TASKS_IN_PROGRESS
item_id
task (string)
UNIQUE (task, item_id)
TASKS
not stored in DB but defined in configuration, and keyed by name
FAILURES
id (num)
task_result_id
error_data (json)
occurred_at
API endpoints
=============
GET /items/:id -- get current item data + metadata about it
POST /items/add
tags (opt.): string[]
data (req.): json
DELETE /items/:id
GET /items/:id/metadata
?operations: string[] -- operations to fetch the metadata for
GET /items/:id/operations -- operations that apply to this item, and the status of each (incl. expiry and whether it was successful)
POST /items/:id/operations/:operation/expire -- forcibly mark as expired
POST /items/:id -- ???
GET /tags/:tagname/worklist
?limit: integer -- maximum amount of items in the worklist to fetch
Task arguments/methods
======================
data
The currently-known item data for the item we're processing
id
The canonical ID of the item currently being processed.
NOTE: Instruct the user that this should *not* encode things like a URL to fetch, and that should be part of the item data instead. It's only meant for things like "should we rename this item from a URL to an SKU", for example.
getItem(id)
Asynchronously fetches the item data for the specified item - only typically used for items *other* than the one we're currently processing
getMetadata(taskName)
getMetadata({ id?, taskName })
Asynchronously fetches the stored metadata, if any, for the specified taskName - either for the item we're processing, or for a specified item ID.
createItem({ id?, tags?, data, failIfExists? })
Queues the creation of a new item, ignoring the create operation if the item already exists
NOTE: User can specify explicitly that it should fail with an error if the item already exists, but that's not usually recommended - instead, scraper code should robustly deal with discarded creates, which can also be an important implementation detail for performance in a distributed scraping cluster.
renameItem(to)
renameItem({ from?, to })
Changes the primary item ID for the specified (or current) item, updating all alias pointers to point at the new `to` ID.
mergeItem({ from?, into, merge: (intoOldData, fromOldData) => intoNewData, mergeMetadata: { [taskName]?: merger } })
Merges the specified (or current) item into the specified other item, using a user-supplied merging callback. Note that intoOldData comes first as a callback argument, because the user will almost always need that data, but only sometimes needs the existing data (if any) of the to-be-merged item, which will be subsequently deleted.
NOTE: This also changes all the alias pointers from the `from` to the `into`, *including* the ID of the to-be-deleted `from` item (which the user can separately delete using `deleteAlias`, if they want to). Essentially, it leaves behind a redirect.
NOTE: For any tasks which do not have a merger specified, it takes whichever set of metadata is most recent
deleteItem()
deleteItem({ id? })
Deletes the specified (or current) item from the database.
createAlias({ from, to?, failIfExists? })
Creates a new alias from the given `from` identifier to the specified `to` (or current, if unspecified) item. Silently fails if the `from` already exists, unless `failIfExists` is true.
NOTE: The `to` will be looked up in the aliases table rather than being assumed to be an item ID, to allow for collapsing alias pointers.
deleteAlias(from)
Deletes the specified alias. Fails with an error if the alias points to itself (and so is an item ID rather than a 'real' alias).
updateData((oldData) => newData)
updateData({ id?, merge: (oldData) => newData })
Queues a data update for the current or specified item, invoking a callback with the previous data as the argument
updateMetadata((oldMetadata) => newMetadata)
updateMetadata({ id?, taskName?, merge: (oldData) => newData })
Queues a data update for the current or specified task, for the current or specified item, invoking a callback with the previous data as the argument
NOTE: Always invoked, even if there's no old data in the database, in which case the `oldData` argument is an empty object. That way the user usually only needs to specify a single piece of logic to handle both fresh and updated metadata
NOTE: This should rarely be used for a taskName other than the current task! Usually data collected across multiple tasks should be stored in the item data instead. Modifying another task's result will *not* result in its TTL being reset.
expire()
expire({ id?, taskName? })
Marks the task result of the specified (or current) task as expired/invalidated, essentially immediately re-queueing it for task execution.
setTTL(ttl)
setTTL({ id?, taskName?, ttl })
Changes the TTL for the task results of the specified (or current) item and taskName. Note that this overrides the default TTL for the task, and is computed from the time of the last task result, *not* from the current time.
allowFailure(allowed)
Marks the current task as "allowed to fail" (default) or "not allowed to fail" (if `false` is passed). "Failing" means any synchronously thrown error or rejected Promise. In all cases, failures are still recorded and items are marked for re-processing - this method only controls whether the task result is discarded or stored. It defaults to disallowing failure because an error can result in corrupted data, but the user may wish to store data anyway in specific cases where they know that the data is valid despite the error.
log(category, message)
Logs a message. The category can be freely specified, but categories with built-in formatting/visibility rules are `error`, `warn`, `info`, and `debug`. Other categories may be hidden by default.
///
- data updates (in which case they are merged into the item, using a user-specified merging callback)
- when using a TTL, data updates are useful for data which is allowed to go stale but should eventually be updated
- revisions are NOT kept for data merges! data overrides are permanent and final.
- new items (in which case the new items are created, but only if the specified ID does not yet exist)
- item upserts (in which case the original item is fetched, if any, and passed to a user-specified merging callback, defaulting to empty object if nonexistent)
- metadata (in which case the task result is stored separately, addressable by a combination of item ID + task name)
- when using a TTL, metadata is useful for data which should only be taken into consideration for as long as it is fresh (as the separate storage allows for identifying the specific data that has expired)
- likewise, expired task result metadata is not kept; it's overwritten with the new metadata instead
- "delete item" (mutually exclusive with metadata and data updates) -- meant to be used for ephemeral items, like pagination URLs in a category listing
- "expire task results" (addressed by ID + operation, default to own ID for "we immediately know this data is stale but we want to prioritize un-processed items first" cases? as well as for "retry because this failed" cases, which are always explicit)
- "create alias" (addressed by ID, default to own ID for "we've discovered an alias for this item")
- "merge item" (addressed by old and new ID)
- repoints any aliases
- takes user-supplied callback for merging item data and task result metadata; for any of those for which no callback is supplied, it simply picks the newest result
///
Task recipes
============
During the task, one or more new items are discovered
Loop, create new item
During the task, we discover that some of our metadata from another task has gone stale
Expire other task's results on the item (upon discovery)
After completing this task, schedule a re-run of another dependent task
Expire other task's results on the item (always)
We've received stale data, and want to immediately schedule a new future run
Expire active task's results on the item
We've found other previously-processed items that should be merged into this one
Find/fetch other items, update current item with new data, repoint aliases from other IDs to self, delete other items
We're currently processing an item that we belatedly discover we already know about under another ID
Update other item with new data, delete self, create alias from own ID to other ID
TO FIGURE OUT: How do we store metadata for another item? Do we need an explicit storeTaskResultFor method to store the current result but associated with another item ID instead?
Queries
=======
Get work list for tag+operation:
select from items
(join tags required-on item_id = item.id and tag = :tag, results on item_id = item.id and operation = :operation)
and (item.id, :operation) not in operations_in_progress
and result is null
or result.expires_at < current_time
or is_invalidated = true
limit 10
// NOTE: Prioritizes (item,operation) pairs that do not have any result at all, only considering stale pairs after the unprocessed pairs run out
WITH candidates AS (
SELECT
DISTINCT ON (items.id)
items.*,
results.expires_at,
results.is_invalidated,
results.is_successful,
results.id AS result_id
FROM items
INNER JOIN tags
ON tags.item_id = items.id
AND tags.tag IN :tags
LEFT JOIN operation_results AS results
ON results.item_id = items.id
AND results.operation = :operation
WHERE NOT EXISTS (
SELECT FROM operations_in_progress AS pr WHERE pr.item_id = items.id
)
)
SELECT * FROM candidates
WHERE result_id IS NULL
UNION
SELECT * FROM candidates
WHERE
results.is_successful = TRUE
AND (
results.expires_at < NOW()
OR results.is_invalidated = TRUE
OR NOT (results.task_version = :taskVersion)
);
Get failures for operation type:
SELECT failures.* FROM failures
INNER JOIN task_results
ON failures.task_result_id = task_results.id
WHERE
task_results.operation = :operation

48
package.json

@ -0,0 +1,48 @@
{
"name": "scraping-server",
"version": "1.0.0",
"main": "index.js",
"repository": "git@git.cryto.net:joepie91/scraping-server.git",
"author": "Sven Slootweg <admin@cryto.net>",
"license": "WTFPL OR CC0-1.0",
"dependencies": {
"@promistream/buffer": "^0.1.1",
"@promistream/map": "^0.1.1",
"@promistream/pipe": "^0.1.2",
"@promistream/simple-sink": "^0.1.1",
"@promistream/simple-source": "^0.1.3",
"@validatem/any-property": "^0.1.3",
"@validatem/anything": "^0.1.0",
"@validatem/array-of": "^0.1.2",
"@validatem/core": "^0.3.15",
"@validatem/default-to": "^0.1.0",
"@validatem/error": "^1.1.0",
"@validatem/is-boolean": "^0.1.1",
"@validatem/is-function": "^0.1.0",
"@validatem/is-number": "^0.1.3",
"@validatem/is-string": "^1.0.0",
"@validatem/require-either": "^0.1.0",
"@validatem/required": "^0.1.1",
"@validatem/wrap-value-as-option": "^0.1.0",
"as-expression": "^1.0.0",
"bluebird": "^3.7.2",
"chalk": "^4.1.0",
"create-error": "^0.3.1",
"date-fns": "^2.18.0",
"debug": "^4.3.1",
"default-value": "^1.0.0",
"express": "^4.17.1",
"express-promise-router": "^4.0.1",
"knex": "^0.21.17",
"map-obj": "^4.2.0",
"ms": "^2.1.3",
"objection": "^2.2.14",
"pg": "^8.5.1",
"syncpipe": "^1.0.0",
"yargs": "^16.2.0"
},
"devDependencies": {
"@joepie91/eslint-config": "^1.1.0",
"eslint": "^7.21.0"
}
}

9
src/errors.js

@ -0,0 +1,9 @@
"use strict";
const createError = require("create-error");
// TODO: Make nice
module.exports = {
NotFound: createError("NotFound")
};

71
src/index.js

@ -0,0 +1,71 @@
"use strict";
const Promise = require("bluebird");
const express = require("express");
const expressPromiseRouter = require("express-promise-router");
const initialize = require("./initialize");
return Promise.try(() => {
return initialize();
}).then((state) => {
let { db, knex } = state;
let app = express();
let router = expressPromiseRouter();
router.get("/items/:id", (req, res) => {
return Promise.try(() => {
return db.Item.query(knex).findById(req.params.id);
}).then((item) => {
if (item != null) {
res.json(item);
} else {
throw new errors.NotFound(`No such item exists`);
}
});
});
router.delete("/items/:id", (req, res) => {
return Promise.try(() => {
return db.Item.query(knex)
.findById(req.params.id)
.delete();
}).then((affectedRows) => {
if (affectedRows > 0) {
res.status(204).end();
} else {
throw new errors.NotFound(`No such item exists`);
}
});
});
// MARKER: Stub out routes, replace error implementation, add response generation for HTTP errors, write database migration, implement parsing engine core
router.get("/items/:id/metadata", (req, res) => {
});
router.get("/items/:id/operations", (req, res) => {
});
router.get("/items/:id/operations/:operation/expire", (req, res) => {
});
router.post("/items/add", (req, res) => {
});
router.put("/items/add/:id", (req, res) => {
});
router.get("/items/:id", (req, res) => {
});
app.use(router);
app.listen(3000);
});

33
src/initialize.js

@ -0,0 +1,33 @@
"use strict";
const Promise = require("bluebird");
const path = require("path");
const knex = require("knex");
const { knexSnakeCaseMappers } = require("objection");
const models = require("./models");
let migrationsFolder = path.join(__dirname, "../migrations");
module.exports = function initialize({ knexfile }) {
let knexInstance = knex({
... knexfile,
... knexSnakeCaseMappers()
});
let state = {
knex: knexInstance
};
return Promise.try(() => {
return knexInstance.migrate.latest({
directory: migrationsFolder
});
}).then(() => {
return {
... state,
db: models(state)
};
});
};

164
src/kernel.js

@ -0,0 +1,164 @@
"use strict";
const Promise = require("bluebird");
const defaultValue = require("default-value");
const chalk = require("chalk");
const util = require("util");
const syncpipe = require("syncpipe");
const rateLimit = require("@ppstreams/rate-limit");
const simpleSink = require("@promistream/simple-sink");
const pipe = require("@promistream/pipe");
const parallelize = require("@ppstreams/parallelize");
const initialize = require("./initialize");
// TODO: Publish this as a separate package
// Inverts an object of arrays, eg. {a: [x, y], b: [x, z]} becomes {x: [a, b], y: [a], z: [b]}
// Useful for eg. tag mappings
function invertMapping(object) {
	let inverted = {};

	Object.entries(object).forEach(([ key, values ]) => {
		values.forEach((value) => {
			if (inverted[value] == null) {
				inverted[value] = [];
			}

			inverted[value].push(key);
		});
	});

	return inverted;
}
// Debugging helper: prints a value and returns it unchanged, so it can be
// spliced into the middle of an expression pipeline.
function log(value) {
	console.log(value);

	return value;
}
// Builds a kernel from a user-supplied configuration object. Connects to
// PostgreSQL (via initialize, which also runs migrations) and resolves to a
// handle exposing `run`, `simulate`, and `shutdown`.
// NOTE(review): the requires at the top of this file use the "@ppstreams"
// scope (rate-limit, parallelize), but package.json only declares
// "@promistream/*" packages — presumably a typo'd scope; verify.
module.exports = function createKernel(configuration) {
	return Promise.try(() => {
		return initialize({
			knexfile: {
				client: "pg",
				connection: configuration.database
			}
		});
	}).then((state) => {
		// These modules are factories parameterized on the shared state (knex, models)
		const queries = require("./queries")(state);
		const createTaskStream = require("./task-stream")(state);

		let { knex } = state;

		// Inserts the configured seed items; failIfExists is false, so items
		// that already exist are silently left alone.
		function insertSeeds() {
			return Promise.map(configuration.seed, (item) => {
				return queries.createItem(knex, {
					... item,
					allowUpsert: false,
					failIfExists: false
				});
			});
		}

		// Warns (without failing) when locked tasks exist, e.g. left behind by
		// a previous process crash; locked tasks will be skipped by the streams.
		function checkLockedTasks() {
			return Promise.try(() => {
				return queries.countLockedTasks(knex);
			}).then((lockedCount) => {
				if (lockedCount > 0) {
					console.log(`${chalk.bold.red("WARNING:")} There are ${lockedCount} tasks currently locked, and they will not be run! This may be caused by a process crash in the past. See the documentation for more details on how to solve this issue.`);
				}
			});
		}

		// Starts one task stream per configured task, attaching the shared
		// (global) rate limiter and parallelization where configured.
		function runTaskStreams() {
			// configuration.tags maps tag -> tasks; invert it to task -> tags
			let tasks = invertMapping(configuration.tags);

			let attachToGlobalRateLimit = (configuration.taskInterval != null)
				? rateLimit.clonable(configuration.taskInterval)
				: undefined;

			console.log(`Starting ${Object.keys(tasks).length} tasks...`);

			return Promise.map(Object.entries(tasks), ([ task, tags ]) => {
				let taskConfiguration = configuration.tasks[task];

				if (taskConfiguration != null) {
					let taskStream = createTaskStream({
						task: task,
						tags: tags,
						// Version defaults to "0"; bumping it invalidates results from older versions
						taskVersion: defaultValue(taskConfiguration.version, "0"),
						ttl: taskConfiguration.ttl,
						run: taskConfiguration.run,
						globalRateLimiter: (attachToGlobalRateLimit != null)
							? attachToGlobalRateLimit()
							: null,
						globalParallelize: (configuration.parallelTasks != null)
							? parallelize(configuration.parallelTasks)
							: null
					});

					// Drain the stream; log each item as it completes
					return pipe([
						taskStream,
						simpleSink((completedItem) => {
							console.log(`[completed] ${completedItem.id}`);
						})
					]).read();
				} else {
					throw new Error(`Task '${task}' is defined to run for tags [${tags}], but no such task is defined`);
				}
			});
		}

		// Runs one task against one item with every mutation method replaced by
		// a logging stub, so the user can preview what the task would do.
		function simulateTask(id, task) {
			let taskConfiguration = configuration.tasks[task];

			let methods = [ "createItem", "renameItem", "mergeItem", "deleteItem", "createAlias", "deleteAlias", "updateData", "updateMetadata", "expire" ];

			// Build a { methodName: loggingStub } object for all mutation methods
			let simulatedMethods = syncpipe(methods, [
				(_) => _.map((method) => [ method, function() {
					console.log(`${chalk.bold.yellow.bgBlack(`${method} (simulated):`)} ${util.inspect(arguments, { colors: true, depth: null })}`);
				}]),
				(_) => Object.fromEntries(_)
			]);

			return Promise.try(() => {
				return queries.getItem(knex, id);
			}).then((item) => {
				// Read-only access (getItem) stays real; only mutations are simulated
				return taskConfiguration.run({
					id: item.id,
					data: item.data,
					getItem: function (id) {
						return queries.getItem(knex, id);
					},
					... simulatedMethods
				});
			});
		}

		return {
			run: function runKernel() {
				return Promise.try(() => {
					return insertSeeds();
				}).then(() => {
					return checkLockedTasks();
				}).then(() => {
					return runTaskStreams();
				});
			},
			simulate: function simulate({ itemID, task }) {
				return Promise.try(() => {
					return insertSeeds();
				}).then(() => {
					return checkLockedTasks();
				}).then(() => {
					return simulateTask(itemID, task);
				});
			},
			shutdown: function () {
				// TODO: Properly lock all public methods after shutdown is called, and wait for any running tasks to have completed
				knex.destroy();
			}
		};
	});
};

20
src/models/alias.js

@ -0,0 +1,20 @@
"use strict";
const { Model } = require("objection");
module.exports = function ({ db }) {
return class Alias extends Model {
static tableName = "aliases";
static idColumn = "alias";
static get relationMappings() {
return {
item: {
relation: Model.BelongsToOneRelation,
modelClass: db.Item,
join: { from: "aliases.itemId", to: "items.id" }
}
};
};
}
};

19
src/models/failure.js

@ -0,0 +1,19 @@
"use strict";
const { Model } = require("objection");
module.exports = function ({ db }) {
return class Failure extends Model {
static tableName = "failures";
static get relationMappings() {
return {
taskResult: {
relation: Model.BelongsToOneRelation,
modelClass: db.TaskResult,
join: { from: "failures.taskResultId", to: "taskResults.id" }
}
};
};
}
};

17
src/models/index.js

@ -0,0 +1,17 @@
"use strict";
module.exports = function (state) {
let db = {};
let combinedState = { ... state, db: db };
Object.assign(db, {
Alias: require("./alias")(combinedState),
Failure: require("./failure")(combinedState),
Item: require("./item")(combinedState),
Tag: require("./tag")(combinedState),
TaskInProgress: require("./task-in-progress")(combinedState),
TaskResult: require("./task-result")(combinedState),
});
return db;
};

45
src/models/item.js

@ -0,0 +1,45 @@
"use strict";
const { Model } = require("objection");
module.exports = function ({ db }) {
return class Item extends Model {
static tableName = "items";
static get relationMappings() {
return {
aliases: {
relation: Model.HasManyRelation,
modelClass: db.Alias,
join: { from: "items.id", to: "aliases.itemId" }
},
tags: {
relation: Model.HasManyRelation,
modelClass: db.Tag,
join: { from: "items.id", to: "tags.itemId" }
},
taskResults: {
relation: Model.HasManyRelation,
modelClass: db.TaskResult,
join: { from: "items.id", to: "taskResults.itemId" }
},
tasksInProgress: {
relation: Model.HasManyRelation,
modelClass: db.TaskInProgress,
join: { from: "items.id", to: "tasksInProgress.itemId" }
},
failedTasks: {
// Not actually a many-to-many, but that's what objection calls a HasManyThrough...
// https://github.com/Vincit/objection.js/issues/1148
relation: Model.ManyToManyRelation,
modelClass: db.Failure,
join: {
from: "items.id",
through: { from: "task_results.itemId", to: "task_results.id" },
to: "failures.taskResultId"
}
}
};
};
}
};

29
src/models/tag.js

@ -0,0 +1,29 @@
"use strict";
const { Model, QueryBuilder } = require("objection");
module.exports = function ({ db }) {
return class Tag extends Model {
static tableName = "tags";
static get relationMappings() {
return {
item: {
relation: Model.BelongsToOneRelation,
modelClass: db.Item,
join: { from: "tags.itemId", to: "item.id" }
}
};
};
static QueryBuilder = class TagQueryBuilder extends QueryBuilder {
findAllByName(name) {
return this.where({ name: name });
}
renameAll(oldName, newName) {
return this.patch({ name: newName }).where({ name: oldName });
}
};
}
};

20
src/models/task-in-progress.js

@ -0,0 +1,20 @@
"use strict";
const { Model } = require("objection");
module.exports = function ({ db }) {
return class TaskInProgress extends Model {
static tableName = "tasksInProgress";
static idColumn = [ "task", "itemId" ];
static get relationMappings() {
return {
item: {
relation: Model.BelongsToOneRelation,
modelClass: db.Item,
join: { from: "tasksInProgress.itemId", to: "item.id" }
}
};
};
}
};

20
src/models/task-result.js

@ -0,0 +1,20 @@
"use strict";
const { Model } = require("objection");
module.exports = function ({ db }) {
return class TaskResult extends Model {
static tableName = "taskResults";
static idColumn = [ "task", "itemId" ];
static get relationMappings() {
return {
item: {
relation: Model.BelongsToOneRelation,
modelClass: db.Item,
join: { from: "taskResults.itemId", to: "item.id" }
}
};
};
}
};

57
src/mutation-api/database.js

@ -0,0 +1,57 @@
"use strict";
const defaultValue = require("default-value");
module.exports = function (state) {
const queries = require("../queries")(state);
return function createDatabaseMutationAPI({ tx, item, taskVersion }) {
return {
createItem: function (options) {
console.log(`[new] ${options.id}`);
return queries.createItem(tx, {
...options,
parentID: item.id
});
},
renameItem: function (options) {
return queries.renameItem(tx, options);
},
mergeItem: function (options) {
return queries.mergeItem(tx, {
...options,
from: defaultValue(options.from, item.id)
});
},
deleteItem: function (options) {
return queries.deleteItem(tx, {
id: defaultValue(options.id, item.id)
});
},
createAlias: function (options) {
return queries.createAlias(tx, {
...options,
to: defaultValue(options.to, item.id)
});
},
deleteAlias: function (from) {
return queries.deleteAlias(tx, {
from: from
});
},
updateData: function (options) {
return queries.updateData(tx, options);
},
updateMetadata: function (options) {
return queries.updateMetadata(tx, {
... options,
taskVersion: taskVersion
});
},
expire: function (options) {
return queries.expire(tx, options);
}
};
};
};

77
src/mutation-api/simulation.js

@ -0,0 +1,77 @@
"use strict";
const Promise = require("bluebird");
const chalk = require("chalk");
const util = require("util");
// Prints a simulated mutation call: a highlighted method label followed by a
// colorized, unbounded-depth dump of the call's arguments.
function logCall(methodName, args) {
	let label = chalk.bold.yellow.bgBlack(`${methodName} (simulated):`);
	let details = util.inspect(args, { colors: true, depth: null });

	console.log(`${label} ${details}`);
}
module.exports = function (state) {
const queries = require("../queries")(state);
return function createSimulationMutationAPI({ tx, item, taskVersion }) {
return {
createItem: function (options) {
logCall("createItem", {
... options,
update: (options.update != null)
? options.update(item.data)
: undefined
});
},
renameItem: function (options) {
logCall("renameItem", {
... options
});
},
mergeItem: function (options) {
// FIXME: Visualize merges
logCall("createItem", {
... options
});
},
deleteItem: function (options) {
logCall("deleteItem", {
... options
});
},
createAlias: function (options) {
logCall("createAlias", {
... options
});
},
deleteAlias: function (from) {
logCall("deleteAlias", {
from: from
});
},
updateData: function (options) {
logCall("updateData", {
... options,
update: (options.update != null)
? options.update(item.data)
: undefined
});
},
updateMetadata: function (options) {
return Promise.try(() => {
return queries.getItem(tx, id);
}).then((item) => {
// FIXME: Visualize metadata update, actually merge with the correct thing
// MARKER: Expose taskResults as an object mapping instead of an array, somehow
logCall("updateMetadata", {
... options,
update: (options.update != null)
? options.update(item.data)
: undefined
});
});
},
expire: function (options) {
return queries.expire(tx, options);
}
};
};
};

73
src/mutation-api/wrapper.js

@ -0,0 +1,73 @@
"use strict";
const wrapValueAsOption = require("@validatem/wrap-value-as-option");
const required = require("@validatem/required");
const isString = require("@validatem/is-string");
const defaultTo = require("@validatem/default-to");
const validateOptions = require("@validatem/core/src/api/validate-options");
const isFunction = require("@validatem/is-function");
// FIXME: Remaining validators
module.exports = function wrapMutationAPI({ item, task }, api) {
return {
createItem: function (options) {
return api.createItem(options);
},
renameItem: function (_options) {
let options = validateOptions(arguments, [
wrapValueAsOption("to"), {
to: [ required, isString ],
from: [ defaultTo(item.id), isString ]
}
]);
return api.renameItem(options);
},
mergeItem: function (options) {
return api.mergeItem(options);
},
deleteItem: function (options = {}) {
return api.deleteItem(options);
},
createAlias: function (options) {
// TODO: Single-parameter variant?
return api.createAlias(options);
},
deleteAlias: function (from) {
// FIXME: options wrapper
return api.deleteAlias(from);
},
updateData: function (_options) {
let options = validateOptions(arguments, [
wrapValueAsOption("update"), {
id: [ defaultTo(item.id), isString ],
update: [ required, isFunction ]
}
]);
return api.updateData(options);
},
updateMetadata: function (_options) {
let options = validateOptions(arguments, [
wrapValueAsOption("update"), {
id: [ defaultTo(item.id), isString ],
taskName: [ defaultTo(task), isString ],
update: [ required, isFunction ]
}
]);
return api.updateMetadata(options);
},
expire: function (_options) {
let options = validateOptions(arguments, [
defaultTo({}), {
id: [ defaultTo(item.id), isString ],
taskName: [ defaultTo(task), isString ]
}
]);
return api.expire(options);
}
};
};

381
src/queries.js

@ -0,0 +1,381 @@
"use strict";
const Promise = require("bluebird");
const { UniqueViolationError } = require("objection");
const asExpression = require("as-expression");
const { validateArguments } = require("@validatem/core");
const required = require("@validatem/required");
const requireEither = require("@validatem/require-either");
const isString = require("@validatem/is-string");
const isBoolean = require("@validatem/is-boolean");
const isFunction = require("@validatem/is-function");
const isNumber = require("@validatem/is-number");
const arrayOf = require("@validatem/array-of");
const defaultTo = require("@validatem/default-to");
const anyProperty = require("@validatem/any-property");
const anything = require("@validatem/anything");
const ValidationError = require("@validatem/error");
const { addSeconds } = require("date-fns");
const syncpipe = require("syncpipe");
const defaultValue = require("default-value");
// Validatem validator: accepts anything that looks like a Knex instance or
// Knex transaction, duck-typed on the presence of `where` and `raw`.
function isTX(value) {
	// FIX: guard against null/undefined first; the bare property access would
	// have thrown a TypeError instead of a proper validation error
	if (value == null || value.where == null || value.raw == null) {
		throw new ValidationError(`Must be a valid Knex or Knex transaction instance`);
	}
}
function noop() {}

// Converts an array of TaskResult rows into an object mapping task name ->
// result row.
// FIX: the original passed a single pair-building step to syncpipe, which was
// applied to the *array itself* (yielding `[ undefined, undefined ]`) instead
// of being mapped over the rows. The sole caller (mergeItem) reads
// `.metadata`, `.updatedAt` etc. off the values, so each value must be the
// full row, not just its metadata.
// NOTE(review): rows are inserted elsewhere in this file with a `task` column;
// confirm whether the name property here should be `taskName` or `task`.
function taskResultsToObject(taskResults) {
	return Object.fromEntries(
		taskResults.map((result) => [ result.taskName, result ])
	);
}
module.exports = function ({ db }) {
return {
getItem: function (tx, id) {
return Promise.try(() => {
return db.Alias.relatedQuery("item", tx)
.for(id)
.withGraphFetched("taskResults");
}).then((results) => {
if (results.length > 0) {
return results[0];
} else {
throw new Error(`No item exists with ID '${id}'`);
}
});
},
// Creates an item, or upserts into an existing one (resolved through its
// aliases). Tags and aliases are always merged in; a self:self alias is
// always added.
createItem: function (_tx, _options) {
// NOTE: Using `update` instead of `data` makes it an upsert!
// FIXME: Make failIfExists actually work, currently it does nothing as the UNIQUE constraint violation cannot occur for an upsert
let [ tx, { id, tags, aliases, data, update, failIfExists, allowUpsert, parentID }] = validateArguments(arguments, {
tx: [ required, isTX ],
options: [{
id: [ required, isString ],
tags: [ defaultTo([]), arrayOf(isString) ],
aliases: [ defaultTo([]), arrayOf(isString) ],
data: [ anything ], // FIXME: Check for object
update: [ isFunction ],
failIfExists: [ defaultTo(false), isBoolean ],
allowUpsert: [ defaultTo(true), isBoolean ],
parentID: [ isString ]
}, requireEither([ "data", "update" ]) ]
});
// FIXME: Ensure that we run the transaction in full isolation mode, and retry in case of a conflict
return Promise.try(() => {
// NOTE: We look up by alias, since this is an upsert - and so if the specified ID already exists as an alias, we should update the existing item instead of creating a new one with the specified (aliased) ID
return db.Alias
.relatedQuery("item", tx)
.for(id);
}).then((existingItems) => {
let existingItem = existingItems[0];
// Write to the canonical ID when the given ID was an alias
let actualID = (existingItem != null)
? existingItem.id
: id;
let existingData = (existingItem != null)
? existingItem.data
: {};
// `update` recomputes the data from the existing data; plain `data` is
// shallow-merged over it
let newData = (update != null)
? update(existingData)
: { ... existingData, ... data };
// Make sure to add a self:self alias
let allAliases = aliases.concat([ actualID ]);
let newItem = {
id: actualID,
data: newData,
createdBy: parentID,
tags: tags.map((tag) => ({ name: tag })),
aliases: allAliases.map((alias) => ({ alias: alias })),
updatedAt: new Date()
};
if (allowUpsert) {
// NOTE: We *always* do upserts here, even if the user specified `data` rather than `update`, because tags and aliases should always be added even if the item itself already exists. We trust the user not to accidentally reuse IDs between different kinds of objects (which would break in various other ways anyway).
return db.Item.query(tx).upsertGraph(newItem, {
insertMissing: true,
noDelete: true
});
} else {
return db.Item.query(tx).insertGraph(newItem, {
insertMissing: true
});
}
// Bluebird predicate catch: only swallow UNIQUE violations on the items
// table itself, and only when the caller tolerates existing items
}).catch({ name: "UniqueViolationError", table: "items" }, (error) => {
if (failIfExists) {
throw error;
} else {
// Do nothing, just ignore the failure
}
});
},
// Changes an item's primary ID, and leaves a self-alias at the new name so
// existing references keep resolving.
renameItem: function (_tx, _options) {
// options == to || { from, to }
// NOTE(review): the ID patch and the alias creation run concurrently via
// Promise.all -- confirm there is no ordering dependency between them.
let [ tx, { to, from }] = validateArguments(arguments, {
tx: [ required, isTX ],
options: [{
to: [ required, isString ],
from: [ required, isString ]
}]
});
return Promise.all([
db.Item.query(tx).findById(from).patch({ id: to }),
this.createAlias(tx, { from: to, to: to })
]);
},
repointAliases: function (_tx, _options) {
// { from, to }
let [ tx, { to, from }] = validateArguments(arguments, {
tx: [ required, isTX ],
options: [{
to: [ required, isString ],
from: [ required, isString ]
}]
});
return db.Alias.query(tx)
.patch({ itemId: to })
.where({ itemId: from });
},
mergeItem: function (_tx, _options) {
// options = { from, into, merge, mergeMetadata{} }
let [ tx, { from, into, merge, mergeMetadata }] = validateArguments(arguments, {
tx: [ required, isTX ],
options: [{
from: [ required, isString ],
into: [ required, isString ],
merge: [ required, isFunction ],
mergeMetadata: [ defaultTo({}), anyProperty({
key: [ required ],
value: [ required, isFunction ]
})],
}]
});
return Promise.all([
this.getItem(tx, { id: from }),
this.getItem(tx, { id: into }),
]).then(([ from, into ]) => {
let newData = merge(into.data, from.data);
let fromTaskResults = taskResultsToObject(from.taskResults);
let intoTaskResults = taskResultsToObject(into.taskResults);
// FIXME: Deduplicate function
let allTaskKeys = Array.from(new Set([
... Object.keys(fromTaskResults),
... Object.keys(intoTaskResults)
]));
function selectNewestResult(taskA, taskB) {
if (taskA == null) {
return taskB;
} else if (taskB == null) {
return taskA;
} else if (taskA.updatedAt > taskB.updatedAt) {
return taskA;
} else {
return taskB;
}
}
// TODO: Use merge-by-template here instead?
let newTaskResults = allTaskKeys.map((key) => {
let merger = mergeMetadata[key];
let fromTask = fromTaskResults[key];
let intoTask = intoTaskResults[key];
if (merger != null) {
// Generate a new TaskResult that includes data combined from both
let newMetadata = merger(
defaultValue(intoTask.metadata, {}),
defaultValue(fromTask.metadata, {})
);
return {
... intoTask,
metadata: newMetadata,
updatedAt: Date.now()
};
} else {
// Take the newest known TaskResult and just make sure that it is pointing at the correct ID
return {
... selectNewestResult(intoTask, fromTask),
itemId: into.id
};
}
});
let upsertOptions = {
insertMissing: true,
noDelete: true
};
return Promise.try(() => {
return into.$query(tx).upsertGraph({
data: newData,
taskResults: newTaskResults
}, upsertOptions);
}).then(() => {
// NOTE: Repointing aliases has the side-effect of leaving a redirect from the source to the destination item, as each item has a self:self alias
return this.repointAliases(tx, { from: from.id, to: into.id });
}).then(() => {
// NOTE: We don't use this.deleteItem, to sidestep any alias lookups
return db.Item.query(tx).findById(from.id).delete();
});
});
},
// Deletes the item that the given ID or alias resolves to.
// NOTE(review): related rows (aliases, tags, task results) are presumably
// removed via ON DELETE CASCADE -- confirm against the migration.
deleteItem: function (_tx, _options) {
// options = none || { id }
let [ tx, { id }] = validateArguments(arguments, {
tx: [ required, isTX ],
options: [{
id: [ required, isString ]
}]
});
// Resolve through the aliases table so that passing an alias deletes the
// canonical item, not merely the alias row
return db.Alias.relatedQuery("item", tx)
.for(id)
.delete();
// return db.Item.query(tx).findById(id).delete();
},
createAlias: function (_tx, _options) {
// options = { from, to, failIfExists }
let [ tx, { from, to, failIfExists }] = validateArguments(arguments, {
tx: [ required, isTX ],
options: [{
from: [ required, isString ],
to: [ required, isString ],
failIfExists: [ defaultTo(false), isBoolean ] // TODO: Shouldn't this default to true, for any occurrence outside of a merge/rename?
}]
});
let promise = db.Alias.query(tx).insert({ alias: from, itemId: to });
if (failIfExists) {
return promise;
} else {
return promise.catch(UniqueViolationError, noop);
}
},
deleteAlias: function (_tx, _options) {
let [ tx, { from }] = validateArguments(arguments, {
tx: [ required, isTX ],
options: [{
from: [ required, isString ]
}]
});
return db.Alias.query(tx).findById(from).delete();
},
// Applies `update` to an existing item's data. Implemented as an upsert via
// createItem, which (given `update` rather than `data`) recomputes the data
// from whatever is currently stored.
updateData: function (_tx, _options) {
// options = update || { id, update }
let [ tx, { id, update }] = validateArguments(arguments, {
tx: [ required, isTX ],
options: [{
id: [ required, isString ],
update: [ required, isFunction ]
}]
});
// TODO: Figure out the proper delineation between 'creating' and 'updating' an item
return this.createItem(tx, { id, update });
},
updateMetadata: function (_tx, _options) {
// options = update || { id, update, taskName }
let [ tx, { id, update, taskName, taskVersion, ttl }] = validateArguments(arguments, {
tx: [ required, isTX ],
options: [{
id: [ required, isString ],
update: [ required, isFunction ],
taskName: [ required, isString ],
taskVersion: [ required, isString ],
ttl: [ isNumber ]
}]
});
// TODO: failIfExists
// FIXME: metadata_updated_at
return Promise.try(() => {
return db.Alias.query(tx).findById(id);
}).then((alias) => {
let sharedFields = {
isSuccessful: true,
isInvalidated: false,
taskVersion: taskVersion,
updatedAt: new Date(),
expiresAt: (ttl != null)
? addSeconds(new Date(), ttl)
: undefined
};
if (alias != null) {
return Promise.try(() => {
return db.TaskResult.query(tx).findById([ taskName, alias.itemId ]);
}).then((taskResult) => {
if (taskResult != null) {
return taskResult.$query(tx).patch({
... sharedFields,
metadata: update(taskResult.metadata),
});
} else {
return db.TaskResult.query(tx).insert({
... sharedFields,
task: taskName,
itemId: id,
metadata: update({})
});
}
});
}
});
},
expire: function (_tx, _options) {
// options = none || { id, taskName }
let [ tx, { id, taskName }] = validateArguments(arguments, {
tx: [ required, isTX ],
options: [{
id: [ required, isString ],
taskName: [ required, isString ]
}]
});
return db.Alias
.relatedQuery("item.taskResults", tx).for(id)
.where({ taskName: taskName })
.patch({ isInvalidated: true });
},
// Stub: intended to (re)set the TTL on an existing task result.
setTTL: function (options) {
// options = ttl || { id, taskName, ttl }
// FIXME
},
// Stub: intended to toggle whether failures are tolerated for a task.
allowFailure: function (allowed) {
},
// Stub: intended structured logging hook for task code.
log: function (category, message) {
},
// Counts rows in tasksInProgress; used at startup to detect stale locks
// left behind by an unclean shutdown.
countLockedTasks: function (tx) {
return Promise.try(() => {
return db.TaskInProgress.query(tx).count({ count: "*" });
}).then((result) => {
// The aggregate query resolves to an array with a single row
return result[0].count;
});
}
};
};

47
src/streams/process-task-safely.js

@ -0,0 +1,47 @@
"use strict";
const Promise = require("bluebird");
const { UniqueViolationError } = require("objection");
const pipe = require("@promistream/pipe");
const map = require("@promistream/map");
const mapFilter = require("@promistream/map-filter");
module.exports = function ({ db, knex }) {
return function processTaskSafely(task, processHandler) {
let lockStream = mapFilter((item) => {
return Promise.try(() => {
return db.TaskInProgress.query(knex).insert({
task: task,
item_id: item.id
});
}).then(() => {
return item;
}).catch(UniqueViolationError, () => {
return mapFilter.NoValue;
});
});
let processUnlockStream = map((item) => {
return Promise.try(() => {
return knex.transaction((tx) => {
return processHandler(item, tx);
});
}).finally(() => {
return db.TaskInProgress.query(knex)
.delete()
.where({
task: task,
item_id: item.id
});
}).then(() => {
return item;
});
});
return pipe([
lockStream,
processUnlockStream
]);
};
};

154
src/task-stream.js

@ -0,0 +1,154 @@
"use strict";
const Promise = require("bluebird");
const ms = require("ms");
const dateFns = require("date-fns");
const syncpipe = require("syncpipe");
const debug = require("debug")("scrapingserver");