Refactor to pre-generate task queue for performance, fix some refactoring errors, fix some model bugs

This commit is contained in:
Sven Slootweg 2022-11-23 20:15:52 +01:00
parent fb93e902a8
commit b9b0e63454
11 changed files with 131 additions and 36 deletions

21
.vscode/launch.json vendored Normal file
View file

@ -0,0 +1,21 @@
{
// VS Code debugger launch configuration for this project.
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
// Starts ./bin/simulate under the Node.js debugger.
"type": "node",
"request": "launch",
"name": "Launch Program",
// Keep the debugger from stepping into Node.js' own internal modules.
"skipFiles": [
"<node_internals>/**"
],
"program": "./bin/simulate",
// Command-line arguments handed to ./bin/simulate.
// NOTE(review): the first argument points outside this repository and looks machine-specific;
// the other two look like a task name and an item ID - confirm against bin/simulate.
"args": ["../seekseek/scraper-config/", "lcsc:normalizeProduct", "lcsc:product:C494972"],
"env": {
// NOTE(review): presumably switches on debug logging for the PostgreSQL backend namespaces - confirm.
"DEBUG": "srap:backend:postgresql:*"
}
}
]
}

View file

@ -0,0 +1,19 @@
"use strict";
/**
 * Forward migration: turns the in-progress table into the task queue.
 *
 * `started_at` becomes nullable (defaulting to NULL), a non-nullable boolean
 * `started` column (defaulting to false) is added, and the table
 * `srap_tasks_in_progress` is renamed to `srap_queue`.
 *
 * @param {object} knex - Knex instance supplied by the migration runner.
 * @param {Function} Promise - Promise constructor passed by the runner (unused).
 * @returns {object} The chained Knex schema builder.
 */
module.exports.up = function(knex, Promise) {
	const reshapeColumns = (table) => {
		table.timestamp("started_at").alter().nullable().defaultTo(null);
		table.boolean("started").notNullable().defaultTo(false);
	};

	const schema = knex.schema;

	return schema
		.alterTable("srap_tasks_in_progress", reshapeColumns)
		.renameTable("srap_tasks_in_progress", "srap_queue");
};
/**
 * Reverse migration: turns `srap_queue` back into `srap_tasks_in_progress`.
 *
 * Rows whose `started_at` is NULL are deleted first. The forward migration
 * made that column nullable with a NULL default, so any such row would make
 * the NOT NULL constraint below fail and abort the rollback.
 *
 * After that the table is renamed, `started_at` is made non-nullable again
 * (defaulting to the current time), and the `started` column is dropped.
 *
 * @param {object} knex - Knex instance supplied by the migration runner.
 * @param {Function} Promise - Promise constructor passed by the runner (unused).
 * @returns {Promise} Settles once the cleanup and the schema changes have run.
 */
module.exports.down = function(knex, Promise) {
	return knex("srap_queue")
		.whereNull("started_at")
		.delete()
		.then(() => {
			return knex.schema
				.renameTable("srap_queue", "srap_tasks_in_progress")
				.alterTable("srap_tasks_in_progress", (table) => {
					table.timestamp("started_at").alter().notNullable().defaultTo(knex.fn.now());
					table.dropColumn("started");
				});
		});
};

View file

@ -237,7 +237,7 @@ module.exports = function (state) {
}); });
return mutableOperation((tx) => { return mutableOperation((tx) => {
return backend.moveItem(tx, { options, allowMerge: (options.merge != null) }); return backend.moveItem(tx, { ... options, allowMerge: (options.merge != null) });
}); });
}, },

View file

@ -85,10 +85,9 @@ module.exports = function(state) {
lock: function (tx, { id, task }) { lock: function (tx, { id, task }) {
return Promise.try(() => { return Promise.try(() => {
return db.TaskInProgress.query(tx).insert({ // FIXME: use Objection for this?
task: task.name, // FIXME: Read item first to make it prevent edit conflicts via transaction
item_id: id return tx.raw(`UPDATE srap_queue SET started = TRUE, started_at = NOW() WHERE task = :task AND item_id = :id`, { task: task.name, id: id });
});
}).then(() => { }).then(() => {
return true; return true;
}).catch({ name: "UniqueViolationError" }, () => { }).catch({ name: "UniqueViolationError" }, () => {
@ -338,7 +337,7 @@ module.exports = function(state) {
countLockedTasks: function (tx) { countLockedTasks: function (tx) {
return Promise.try(() => { return Promise.try(() => {
return db.TaskInProgress.query(tx).count({ count: "*" }); return db.TaskInProgress.query(tx).count({ count: "*" }).where({ started: true });
}).then((result) => { }).then((result) => {
return result[0].count; return result[0].count;
}); });

View file

@ -26,7 +26,7 @@ module.exports = function ({ db }) {
tasksInProgress: { tasksInProgress: {
relation: Model.HasManyRelation, relation: Model.HasManyRelation,
modelClass: db.TaskInProgress, modelClass: db.TaskInProgress,
join: { from: "srap_items.id", to: "srap_tasksInProgress.itemId" } join: { from: "srap_items.id", to: "srap_queue.itemId" }
}, },
failedTasks: { failedTasks: {
// Not actually a many-to-many, but that's what objection calls a HasManyThrough... // Not actually a many-to-many, but that's what objection calls a HasManyThrough...
@ -35,7 +35,7 @@ module.exports = function ({ db }) {
modelClass: db.Failure, modelClass: db.Failure,
join: { join: {
from: "srap_items.id", from: "srap_items.id",
through: { from: "srap_task_results.itemId", to: "srap_task_results.id" }, through: { from: "srap_taskResults.itemId", to: "srap_taskResults.id" },
to: "srap_failures.taskResultId" to: "srap_failures.taskResultId"
} }
} }

View file

@ -11,7 +11,7 @@ module.exports = function ({ db }) {
item: { item: {
relation: Model.BelongsToOneRelation, relation: Model.BelongsToOneRelation,
modelClass: db.Item, modelClass: db.Item,
join: { from: "srap_tags.itemId", to: "srap_item.id" } join: { from: "srap_tags.itemId", to: "srap_items.id" }
} }
}; };
}; };

View file

@ -4,7 +4,7 @@ const { Model } = require("objection");
module.exports = function ({ db }) { module.exports = function ({ db }) {
return class TaskInProgress extends Model { return class TaskInProgress extends Model {
static tableName = "srap_tasksInProgress"; static tableName = "srap_queue";
static idColumn = [ "task", "itemId" ]; static idColumn = [ "task", "itemId" ];
static get relationMappings() { static get relationMappings() {
@ -12,7 +12,7 @@ module.exports = function ({ db }) {
item: { item: {
relation: Model.BelongsToOneRelation, relation: Model.BelongsToOneRelation,
modelClass: db.Item, modelClass: db.Item,
join: { from: "srap_tasksInProgress.itemId", to: "srap_item.id" } join: { from: "srap_queue.itemId", to: "srap_items.id" }
} }
}; };
}; };

View file

@ -12,7 +12,7 @@ module.exports = function ({ db }) {
item: { item: {
relation: Model.BelongsToOneRelation, relation: Model.BelongsToOneRelation,
modelClass: db.Item, modelClass: db.Item,
join: { from: "srap_taskResults.itemId", to: "srap_item.id" } join: { from: "srap_taskResults.itemId", to: "srap_items.id" }
} }
}; };
}; };

View file

@ -6,7 +6,20 @@ const buffer = require("@promistream/buffer");
const pipe = require("@promistream/pipe"); const pipe = require("@promistream/pipe");
const simpleSource = require("@promistream/simple-source"); const simpleSource = require("@promistream/simple-source");
const query = ` const fetchQuery = `
SELECT
srap_items.*
FROM
srap_queue
LEFT JOIN srap_items
ON srap_items.id = srap_queue.item_id
WHERE
srap_queue.task = :task
AND srap_queue.started = FALSE
LIMIT :resultLimit
`;
const fillQuery = `
WITH WITH
dependency_tasks AS ( dependency_tasks AS (
SELECT * FROM SELECT * FROM
@ -14,7 +27,6 @@ const query = `
), ),
matching_items AS ( matching_items AS (
SELECT SELECT
DISTINCT ON (srap_items.id)
srap_items.*, srap_items.*,
results.updated_at AS result_date, results.updated_at AS result_date,
results.task_version, results.task_version,
@ -32,22 +44,19 @@ const query = `
LEFT JOIN srap_task_results AS results LEFT JOIN srap_task_results AS results
ON results.item_id = srap_items.id ON results.item_id = srap_items.id
AND results.task = :task AND results.task = :task
WHERE
NOT EXISTS (
SELECT FROM srap_tasks_in_progress AS pr WHERE pr.item_id = srap_items.id
)
), ),
candidates AS ( candidates AS (
SELECT * FROM matching_items SELECT * FROM matching_items
WHERE result_date IS NULL WHERE result_date IS NULL
UNION UNION ALL
SELECT * FROM matching_items SELECT * FROM matching_items
WHERE is_candidate = TRUE WHERE is_candidate = TRUE
OR NOT (task_version = :taskVersion) OR NOT (task_version = :taskVersion)
) )
( (
SELECT SELECT
* :task AS task,
id AS item_id
FROM FROM
candidates candidates
WHERE WHERE
@ -70,26 +79,53 @@ const query = `
) )
) )
) )
) LIMIT :resultLimit; )
`; `;
module.exports = function ({ metrics, backendSettings }) { module.exports = function ({ metrics, backendSettings, knex }) {
return function (tx, { task }) { return function (tx, { task }) {
let refillParameters = {
tags: task.tags,
task: task.name,
taskVersion: task.version,
dependencyTaskDefinitions: JSON.stringify(task.dependencies.map((dependency) => {
// Case-mapping for SQL compatibility
return { task_version: dependency.version, task: dependency.name };
}))
};
let fetchParameters = {
task: task.name,
resultLimit: backendSettings.taskBatchSize
};
// Inserts the rows selected by fillQuery into srap_queue (rows that already
// exist are left alone), records how long that took and how many rows were
// added, and resolves to the number of rows added.
function refillQueue() {
	const refillStartedAt = Date.now();

	const runInsert = () => {
		// NOTE: We are deliberately bypassing the transaction here! Also deliberately not using VALUES, since we're inserting from the results of another query instead
		return knex.raw(`
INSERT INTO srap_queue (task, item_id)
(${fillQuery})
ON CONFLICT (task, item_id) DO NOTHING;
`, refillParameters);
	};

	const reportRefill = (response) => {
		const elapsedMs = Date.now() - refillStartedAt;

		// The timing gauge is in seconds; the debug line below reports milliseconds.
		metrics.taskRefillTime.labels({ task: task.name }).set(elapsedMs / 1000);
		metrics.taskRefillResults.labels({ task: task.name }).set(response.rowCount);

		debug(`Queue for '${task.name}' was refilled with ${response.rowCount} items in ${elapsedMs}ms`);

		return response.rowCount;
	};

	return Promise.try(runInsert).then(reportRefill);
}
return pipe([ return pipe([
simpleSource(() => { simpleSource(() => {
let startTime = Date.now(); let startTime = Date.now();
return Promise.try(() => { return Promise.try(() => {
return tx.raw(query, { return tx.raw(fetchQuery, fetchParameters);
tags: task.tags,
task: task.name,
taskVersion: task.version,
resultLimit: backendSettings.taskBatchSize,
dependencyTaskDefinitions: JSON.stringify(task.dependencies.map((dependency) => {
// Case-mapping for SQL compatibility
return { task_version: dependency.version, task: dependency.name };
}))
});
}).then((result) => { }).then((result) => {
let timeElapsed = Date.now() - startTime; let timeElapsed = Date.now() - startTime;
@ -101,8 +137,17 @@ module.exports = function ({ metrics, backendSettings }) {
if (result.rowCount > 0) { if (result.rowCount > 0) {
return result.rows; return result.rows;
} else { } else {
// TODO: Consider using LISTEN/NOTIFY instead? return Promise.try(() => {
return Promise.resolve([]).delay(backendSettings.taskBatchDelay); return refillQueue();
}).then((newItems) => {
if (newItems === 0) {
// TODO: Consider using LISTEN/NOTIFY instead? Worth the added complexity?
return Promise.resolve([]).delay(backendSettings.taskBatchDelay);
} else {
// Have another go right away
return [];
}
});
} }
}); });
}), }),

View file

@ -104,7 +104,7 @@ module.exports = function (state) {
return Promise.all([ return Promise.all([
this.getItem(tx, { id: from, optional: true }), this.getItem(tx, { id: from, optional: true }),
this.getItem(tx, { id: into, optional: true }), this.getItem(tx, { id: into, optional: true }),
]).then((fromObj, maybeIntoObj) => { ]).then(([ fromObj, maybeIntoObj ]) => {
if (fromObj != null) { if (fromObj != null) {
let intoObj = maybeIntoObj ?? { let intoObj = maybeIntoObj ?? {
id: into, id: into,

View file

@ -38,8 +38,19 @@ module.exports = function createPrometheus() {
name: "srap_task_fetch_results_count", name: "srap_task_fetch_results_count",
help: "Amount of new scraping tasks fetched during the most recent attempt", help: "Amount of new scraping tasks fetched during the most recent attempt",
labelNames: [ "task" ] labelNames: [ "task" ]
}),
taskRefillTime: new prometheusClient.Gauge({
registers: [ prometheusRegistry ],
name: "srap_task_refill_seconds",
help: "Time needed for the most recent refill of the task queue",
labelNames: [ "task" ]
}),
taskRefillResults: new prometheusClient.Gauge({
registers: [ prometheusRegistry ],
name: "srap_task_refill_results_count",
help: "Amount of new scraping tasks added to queue during the most recent attempt",
labelNames: [ "task" ]
}) })
// FIXME: Measure queue-refill task
} }
}; };
}; };