Refactor to pre-generate task queue for performance, fix some refactoring errors, fix some model bugs

master
Sven Slootweg 1 year ago
parent fb93e902a8
commit b9b0e63454

@ -0,0 +1,21 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
// Single debug configuration: starts ./bin/simulate under the Node.js debugger.
"type": "node",
"request": "launch",
"name": "Launch Program",
// Keep the debugger from stepping into Node.js' own internal modules.
"skipFiles": [
"<node_internals>/**"
],
"program": "./bin/simulate",
// NOTE(review): these arguments are hard-coded for one developer's setup. The first is a
// relative path that points outside this repository; the other two look like a task name
// and an item ID -- confirm against bin/simulate's usage and adjust for your own checkout.
"args": ["../seekseek/scraper-config/", "lcsc:normalizeProduct", "lcsc:product:C494972"],
// NOTE(review): presumably turns on debug logging for the PostgreSQL backend namespaces
// via the DEBUG environment variable -- confirm which logger reads it.
"env": {
"DEBUG": "srap:backend:postgresql:*"
}
}
]
}

@ -0,0 +1,19 @@
"use strict";
/**
 * Forward migration: turns the `srap_tasks_in_progress` table into `srap_queue`.
 *
 * - `started_at` becomes nullable with a NULL default.
 * - A new non-nullable boolean column `started` is added, defaulting to FALSE.
 * - The table is then renamed to `srap_queue`.
 *
 * @param {object} knex - The knex instance handed to the migration by knex.
 * @param {*} Promise - Unused; kept so the signature stays as knex may call it.
 * @returns {object} The knex schema builder (thenable) for the queued statements.
 */
module.exports.up = function(knex, Promise) {
	const schema = knex.schema;

	const withQueueColumns = schema.alterTable("srap_tasks_in_progress", (queueTable) => {
		// An entry no longer needs a start time until it is actually started.
		queueTable.timestamp("started_at").alter().nullable().defaultTo(null);
		queueTable.boolean("started").notNullable().defaultTo(false);
	});

	return withQueueColumns.renameTable("srap_tasks_in_progress", "srap_queue");
};
/**
 * Rollback migration: turns `srap_queue` back into `srap_tasks_in_progress`.
 *
 * The forward migration made `started_at` nullable and added the `started` flag, so the
 * queue may hold rows that were never started and therefore have `started_at = NULL`.
 * Restoring the NOT NULL constraint while such rows exist makes the ALTER fail, which
 * would leave the rollback unusable. Those rows cannot be represented in the restored
 * schema (where `started_at` is mandatory), so they are deleted first.
 *
 * NOTE: this deliberately discards queued-but-not-started entries.
 *
 * @param {object} knex - The knex instance handed to the migration by knex.
 * @param {*} Promise - Unused; kept so the signature stays as knex may call it.
 * @returns {Promise} Resolves once the rows are purged and the schema is restored.
 */
module.exports.down = function(knex, Promise) {
	return knex("srap_queue")
		// Un-started entries, plus any row lacking a start time, would violate NOT NULL below.
		.where({ started: false })
		.orWhereNull("started_at")
		.delete()
		.then(() => {
			return knex.schema
				.renameTable("srap_queue", "srap_tasks_in_progress")
				.alterTable("srap_tasks_in_progress", (table) => {
					table.timestamp("started_at").alter().notNullable().defaultTo(knex.fn.now());
					table.dropColumn("started");
				});
		});
};

@ -237,7 +237,7 @@ module.exports = function (state) {
});
return mutableOperation((tx) => {
return backend.moveItem(tx, { options, allowMerge: (options.merge != null) });
return backend.moveItem(tx, { ... options, allowMerge: (options.merge != null) });
});
},

@ -85,10 +85,9 @@ module.exports = function(state) {
lock: function (tx, { id, task }) {
return Promise.try(() => {
return db.TaskInProgress.query(tx).insert({
task: task.name,
item_id: id
});
// FIXME: use Objection for this?
// FIXME: Read item first to make it prevent edit conflicts via transaction
return tx.raw(`UPDATE srap_queue SET started = TRUE, started_at = NOW() WHERE task = :task AND item_id = :id`, { task: task.name, id: id });
}).then(() => {
return true;
}).catch({ name: "UniqueViolationError" }, () => {
@ -338,7 +337,7 @@ module.exports = function(state) {
countLockedTasks: function (tx) {
return Promise.try(() => {
return db.TaskInProgress.query(tx).count({ count: "*" });
return db.TaskInProgress.query(tx).count({ count: "*" }).where({ started: true });
}).then((result) => {
return result[0].count;
});

@ -26,7 +26,7 @@ module.exports = function ({ db }) {
tasksInProgress: {
relation: Model.HasManyRelation,
modelClass: db.TaskInProgress,
join: { from: "srap_items.id", to: "srap_tasksInProgress.itemId" }
join: { from: "srap_items.id", to: "srap_queue.itemId" }
},
failedTasks: {
// Not actually a many-to-many, but that's what objection calls a HasManyThrough...
@ -35,7 +35,7 @@ module.exports = function ({ db }) {
modelClass: db.Failure,
join: {
from: "srap_items.id",
through: { from: "srap_task_results.itemId", to: "srap_task_results.id" },
through: { from: "srap_taskResults.itemId", to: "srap_taskResults.id" },
to: "srap_failures.taskResultId"
}
}

@ -11,7 +11,7 @@ module.exports = function ({ db }) {
item: {
relation: Model.BelongsToOneRelation,
modelClass: db.Item,
join: { from: "srap_tags.itemId", to: "srap_item.id" }
join: { from: "srap_tags.itemId", to: "srap_items.id" }
}
};
};

@ -4,7 +4,7 @@ const { Model } = require("objection");
module.exports = function ({ db }) {
return class TaskInProgress extends Model {
static tableName = "srap_tasksInProgress";
static tableName = "srap_queue";
static idColumn = [ "task", "itemId" ];
static get relationMappings() {
@ -12,7 +12,7 @@ module.exports = function ({ db }) {
item: {
relation: Model.BelongsToOneRelation,
modelClass: db.Item,
join: { from: "srap_tasksInProgress.itemId", to: "srap_item.id" }
join: { from: "srap_queue.itemId", to: "srap_items.id" }
}
};
};

@ -12,7 +12,7 @@ module.exports = function ({ db }) {
item: {
relation: Model.BelongsToOneRelation,
modelClass: db.Item,
join: { from: "srap_taskResults.itemId", to: "srap_item.id" }
join: { from: "srap_taskResults.itemId", to: "srap_items.id" }
}
};
};

@ -6,7 +6,20 @@ const buffer = require("@promistream/buffer");
const pipe = require("@promistream/pipe");
const simpleSource = require("@promistream/simple-source");
const query = `
const fetchQuery = `
SELECT
srap_items.*
FROM
srap_queue
LEFT JOIN srap_items
ON srap_items.id = srap_queue.item_id
WHERE
srap_queue.task = :task
AND srap_queue.started = FALSE
LIMIT :resultLimit
`;
const fillQuery = `
WITH
dependency_tasks AS (
SELECT * FROM
@ -14,7 +27,6 @@ const query = `
),
matching_items AS (
SELECT
DISTINCT ON (srap_items.id)
srap_items.*,
results.updated_at AS result_date,
results.task_version,
@ -32,22 +44,19 @@ const query = `
LEFT JOIN srap_task_results AS results
ON results.item_id = srap_items.id
AND results.task = :task
WHERE
NOT EXISTS (
SELECT FROM srap_tasks_in_progress AS pr WHERE pr.item_id = srap_items.id
)
),
candidates AS (
SELECT * FROM matching_items
WHERE result_date IS NULL
UNION
UNION ALL
SELECT * FROM matching_items
WHERE is_candidate = TRUE
OR NOT (task_version = :taskVersion)
)
(
SELECT
*
:task AS task,
id AS item_id
FROM
candidates
WHERE
@ -70,26 +79,53 @@ const query = `
)
)
)
) LIMIT :resultLimit;
)
`;
module.exports = function ({ metrics, backendSettings }) {
module.exports = function ({ metrics, backendSettings, knex }) {
return function (tx, { task }) {
let refillParameters = {
tags: task.tags,
task: task.name,
taskVersion: task.version,
dependencyTaskDefinitions: JSON.stringify(task.dependencies.map((dependency) => {
// Case-mapping for SQL compatibility
return { task_version: dependency.version, task: dependency.name };
}))
};
let fetchParameters = {
task: task.name,
resultLimit: backendSettings.taskBatchSize
};
function refillQueue() {
let startTime = Date.now();
return Promise.try(() => {
// NOTE: We are deliberately bypassing the transaction here! Also deliberately not using VALUES, since we're inserting from the results of another query instead
return knex.raw(`
INSERT INTO srap_queue (task, item_id)
(${fillQuery})
ON CONFLICT (task, item_id) DO NOTHING;
`, refillParameters);
}).then((response) => {
let timeElapsed = Date.now() - startTime;
metrics.taskRefillTime.labels({ task: task.name }).set(timeElapsed / 1000);
metrics.taskRefillResults.labels({ task: task.name }).set(response.rowCount);
debug(`Queue for '${task.name}' was refilled with ${response.rowCount} items in ${timeElapsed}ms`);
return response.rowCount;
});
}
return pipe([
simpleSource(() => {
let startTime = Date.now();
return Promise.try(() => {
return tx.raw(query, {
tags: task.tags,
task: task.name,
taskVersion: task.version,
resultLimit: backendSettings.taskBatchSize,
dependencyTaskDefinitions: JSON.stringify(task.dependencies.map((dependency) => {
// Case-mapping for SQL compatibility
return { task_version: dependency.version, task: dependency.name };
}))
});
return tx.raw(fetchQuery, fetchParameters);
}).then((result) => {
let timeElapsed = Date.now() - startTime;
@ -101,8 +137,17 @@ module.exports = function ({ metrics, backendSettings }) {
if (result.rowCount > 0) {
return result.rows;
} else {
// TODO: Consider using LISTEN/NOTIFY instead?
return Promise.resolve([]).delay(backendSettings.taskBatchDelay);
return Promise.try(() => {
return refillQueue();
}).then((newItems) => {
if (newItems === 0) {
// TODO: Consider using LISTEN/NOTIFY instead? Worth the added complexity?
return Promise.resolve([]).delay(backendSettings.taskBatchDelay);
} else {
// Have another go right away
return [];
}
});
}
});
}),

@ -104,7 +104,7 @@ module.exports = function (state) {
return Promise.all([
this.getItem(tx, { id: from, optional: true }),
this.getItem(tx, { id: into, optional: true }),
]).then((fromObj, maybeIntoObj) => {
]).then(([ fromObj, maybeIntoObj ]) => {
if (fromObj != null) {
let intoObj = maybeIntoObj ?? {
id: into,

@ -38,8 +38,19 @@ module.exports = function createPrometheus() {
name: "srap_task_fetch_results_count",
help: "Amount of new scraping tasks fetched during the most recent attempt",
labelNames: [ "task" ]
}),
taskRefillTime: new prometheusClient.Gauge({
registers: [ prometheusRegistry ],
name: "srap_task_refill_seconds",
help: "Time needed for the most recent refill of the task queue",
labelNames: [ "task" ]
}),
taskRefillResults: new prometheusClient.Gauge({
registers: [ prometheusRegistry ],
name: "srap_task_refill_results_count",
help: "Amount of new scraping tasks added to queue during the most recent attempt",
labelNames: [ "task" ]
})
// FIXME: Measure queue-refill task
}
};
};

Loading…
Cancel
Save