// srap/src/task-stream.js
"use strict";
// Third-party dependencies. Note that `Promise` is bluebird here, not the
// native Promise; the code below uses the bluebird-specific `Promise.try`,
// `.delay` and filtered `.catch` methods.
const Promise = require("bluebird");
const ms = require("ms");
const dateFns = require("date-fns");
const debug = require("debug")("scrapingserver");
const chalk = require("chalk");

// Promistream building blocks used to assemble the task pipeline.
const simpleSource = require("@promistream/simple-source");
const buffer = require("@promistream/buffer");
const pipe = require("@promistream/pipe");
const rateLimit = require("@promistream/rate-limit");
const parallelize = require("@promistream/parallelize");

const logStatus = require("./log-status");
// const { UniqueViolationError } = require("objection");

// FIXME: Revert inlining of task_states once switched to PostgreSQL 12+, which can do this automatically using NOT MATERIALIZED
// FIXME: Check whether the dependency task_versions are actually being correctly passed in, and aren't accidentally nulls
// Raw SQL used to fetch the next batch of items that a task should process.
//
// Named bindings (filled in by the caller through knex.raw):
//   :dependencyTaskDefinitions - JSON array of { task, task_version } objects
//   :tags                      - tag names, matched with `= ANY(...)`
//   :task                      - name of the task whose results are inspected
//   :taskVersion               - current version of that task
//   :resultLimit               - maximum number of rows to return
//
// Structure:
//   dependency_tasks - the JSON binding expanded into (task, task_version) rows.
//   matching_items   - items carrying at least one of the tags, left-joined to
//                      this task's result row. Items with a row in
//                      srap_tasks_in_progress are excluded; that subquery
//                      matches on item_id only, not on the task name.
//   candidates       - matching items with no recorded result date, or whose
//                      successful result has expired / been invalidated, or
//                      whose stored task_version differs from :taskVersion.
//   final SELECT     - candidates for which no dependency task is missing,
//                      failed, expired or invalidated.
//
// NOTE: Keep explanatory text out of the template literal itself; everything
// inside the backticks is sent to the database verbatim.
let query = `
	WITH
		dependency_tasks AS (
			SELECT * FROM
				json_to_recordset(:dependencyTaskDefinitions) AS x(task text, task_version text)
		),
		matching_items AS (
			SELECT
				DISTINCT ON (srap_items.id)
				srap_items.*,
				results.updated_at AS result_date,
				results.task_version,
				(
					results.is_successful = TRUE
					AND (
						results.expires_at < NOW()
						OR results.is_invalidated = TRUE
					)
				) AS is_candidate
			FROM srap_items
			INNER JOIN srap_tags
				ON srap_tags.item_id = srap_items.id
				AND srap_tags.name = ANY(:tags)
			LEFT JOIN srap_task_results AS results
				ON results.item_id = srap_items.id
				AND results.task = :task
			WHERE
				NOT EXISTS (
					SELECT FROM srap_tasks_in_progress AS pr WHERE pr.item_id = srap_items.id
				)
		),
		candidates AS (
			SELECT * FROM matching_items
				WHERE result_date IS NULL
			UNION
			SELECT * FROM matching_items
				WHERE is_candidate = TRUE
				OR NOT (task_version = :taskVersion)
		)
	(
		SELECT
			*
		FROM
			candidates
		WHERE
			NOT EXISTS (
				SELECT
					results.*
				FROM dependency_tasks
				LEFT JOIN srap_task_results AS results
					ON dependency_tasks.task = results.task
					AND dependency_tasks.task_version = results.task_version
					AND results.item_id = candidates.id
				WHERE
					results.is_successful IS NULL
					OR results.is_successful = FALSE
					OR (
						results.is_successful = TRUE
						AND (
							results.expires_at < NOW()
							OR results.is_invalidated = TRUE
						)
					)
			)
	) LIMIT :resultLimit;
`;
module.exports = function (state) {
const processTaskSafely = require("./streams/process-task-safely")(state);
const queries = require("./queries")(state);
3 years ago
const createDatabaseQueue = require("./queued-database-api")(state);
4 years ago
let { knex, db } = state;
// FIXME: Transaction support!
4 years ago
return function createTaskStream({ task, taskVersion, taskDependencies, taskDependents, taskInterval, tags, run, ttl, globalRateLimiter, globalParallelize, parallelTasks }) {
4 years ago
// TODO: Make nicer
let ttlInSeconds = (ttl != null)
? (typeof ttl === "number")
? ttl / 1000
: ms(ttl) / 1000
: undefined;
return pipe([
simpleSource(() => {
let startTime = Date.now();
return Promise.try(() => {
// console.log("Fetching new batch");
return knex.raw(query, {
tags: tags,
task: task,
taskVersion: taskVersion,
4 years ago
resultLimit: 1000, // TODO: Make configurable
dependencyTaskDefinitions: JSON.stringify(taskDependencies.map((dependency) => {
// Case-mapping for SQL compatibility
return { task_version: dependency.taskVersion, task: dependency.task };
}))
4 years ago
});
}).then((result) => {
let timeElapsed = Date.now() - startTime;
4 years ago
debug(`Task retrieval query for '${task}' took ${timeElapsed}ms and produced ${result.rowCount} results`);
4 years ago
if (result.rowCount > 0) {
// console.log("rows:", result.rows);
return result.rows;
} else {
4 years ago
// FIXME: Make this delay configurable, or maybe even use LISTEN/NOTIFY
return Promise.resolve([]).delay(30000);
4 years ago
}
});
}),
buffer(),
globalRateLimiter,
4 years ago
(taskInterval != null)
? rateLimit(taskInterval)
: null,
4 years ago
processTaskSafely(task, (item, tx) => {
4 years ago
logStatus(task, chalk.bold.cyan, "started", item.id);
4 years ago
3 years ago
let queue = createDatabaseQueue({ tx, item, task, taskVersion, taskDependents, taskDependencies });
4 years ago
return Promise.try(() => {
// TODO: Proper Validatem schemas for each API method
return run({
id: item.id,
data: item.data,
getItem: function (id) {
return queries.getItem(tx, id);
},
3 years ago
... queue.api
4 years ago
});
}).then(() => {
3 years ago
return queue.execute();
4 years ago
}).then(() => {
// Update succeeded
return db.TaskResult.query(tx).findById([ task, item.id ]).patch({
is_successful: true,
updated_at: new Date(),
4 years ago
expires_at: (ttlInSeconds != null)
? dateFns.add(new Date(), { seconds: ttlInSeconds })
: null
4 years ago
});
}).catch((error) => {
4 years ago
logStatus(task, chalk.bold.red, "failed", `${item.id}: ${error.stack}`);
4 years ago
3 years ago
let commonUpdate = {
is_successful: false,
task_version: taskVersion
};
4 years ago
return Promise.try(() => {
// Task failed -- note, cannot use tx here because it has failed
3 years ago
return db.TaskResult.query(knex).insert({
item_id: item.id,
task: task,
metadata: {},
... commonUpdate
});
}).catch({ name: "UniqueViolationError" }, () => {
4 years ago
return db.TaskResult.query(knex).findById([ task, item.id ]).patch({
3 years ago
... commonUpdate
4 years ago
});
}).then(() => {
// throw error;
});
});
}),
4 years ago
// TODO: Sort out a cleaner way to organize local vs. global parallelization
(parallelTasks != null)
? parallelize(parallelTasks)
: globalParallelize
4 years ago
]);
};
};