Refactor to pre-generate task queue for performance, fix some refactoring errors, fix some model bugs
This commit is contained in:
parent
fb93e902a8
commit
b9b0e63454
21
.vscode/launch.json
vendored
Normal file
21
.vscode/launch.json
vendored
Normal file
|
@ -0,0 +1,21 @@
|
|||
{
|
||||
// Use IntelliSense to learn about possible attributes.
|
||||
// Hover to view descriptions of existing attributes.
|
||||
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
|
||||
"version": "0.2.0",
|
||||
"configurations": [
|
||||
{
|
||||
"type": "node",
|
||||
"request": "launch",
|
||||
"name": "Launch Program",
|
||||
"skipFiles": [
|
||||
"<node_internals>/**"
|
||||
],
|
||||
"program": "./bin/simulate",
|
||||
"args": ["../seekseek/scraper-config/", "lcsc:normalizeProduct", "lcsc:product:C494972"],
|
||||
"env": {
|
||||
"DEBUG": "srap:backend:postgresql:*"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
19
migrations/20221123175316_queue-table.js
Normal file
19
migrations/20221123175316_queue-table.js
Normal file
|
@ -0,0 +1,19 @@
|
|||
"use strict";
|
||||
|
||||
// Forward migration: reshapes the "srap_tasks_in_progress" table and renames it to "srap_queue".
// `knex` is the knex instance whose schema builder is used; the `Promise` argument is not used in this body.
// Returns the knex schema builder chain (alterTable followed by renameTable).
module.exports.up = function(knex, Promise) {
|
||||
return knex.schema
|
||||
.alterTable("srap_tasks_in_progress", (table) => {
|
||||
// Alter the existing "started_at" column so that it may be NULL, with NULL as its default.
// NOTE(review): presumably because rows can now exist before the task has started — confirm against the code that sets started_at.
table.timestamp("started_at").alter().nullable().defaultTo(null);
|
||||
// New NOT NULL flag column; defaults to false.
table.boolean("started").notNullable().defaultTo(false);
|
||||
})
|
||||
// The rename is chained after the column changes, so the alterTable above still targets the old table name.
.renameTable("srap_tasks_in_progress", "srap_queue");
|
||||
};
|
||||
|
||||
// Rollback migration: renames "srap_queue" back to "srap_tasks_in_progress" and reverts the column changes.
// `knex` is the knex instance (also used for `knex.fn.now()`); the `Promise` argument is not used in this body.
// Returns the knex schema builder chain (renameTable followed by alterTable).
module.exports.down = function(knex, Promise) {
|
||||
return knex.schema
|
||||
// Rename first, so that the alterTable below operates on the restored table name.
.renameTable("srap_queue", "srap_tasks_in_progress")
|
||||
.alterTable("srap_tasks_in_progress", (table) => {
|
||||
// Make "started_at" NOT NULL again, defaulting to the current time.
// NOTE(review): rows that still have a NULL started_at would presumably make this step fail — confirm before rolling back on a populated table.
table.timestamp("started_at").alter().notNullable().defaultTo(knex.fn.now());
|
||||
// Remove the "started" flag column.
table.dropColumn("started");
|
||||
});
|
||||
};
|
|
@ -237,7 +237,7 @@ module.exports = function (state) {
|
|||
});
|
||||
|
||||
return mutableOperation((tx) => {
|
||||
return backend.moveItem(tx, { options, allowMerge: (options.merge != null) });
|
||||
return backend.moveItem(tx, { ... options, allowMerge: (options.merge != null) });
|
||||
});
|
||||
},
|
||||
|
||||
|
|
|
@ -85,10 +85,9 @@ module.exports = function(state) {
|
|||
|
||||
lock: function (tx, { id, task }) {
|
||||
return Promise.try(() => {
|
||||
return db.TaskInProgress.query(tx).insert({
|
||||
task: task.name,
|
||||
item_id: id
|
||||
});
|
||||
// FIXME: use Objection for this?
|
||||
// FIXME: Read the item first, so that the transaction prevents edit conflicts
|
||||
return tx.raw(`UPDATE srap_queue SET started = TRUE, started_at = NOW() WHERE task = :task AND item_id = :id`, { task: task.name, id: id });
|
||||
}).then(() => {
|
||||
return true;
|
||||
}).catch({ name: "UniqueViolationError" }, () => {
|
||||
|
@ -338,7 +337,7 @@ module.exports = function(state) {
|
|||
|
||||
countLockedTasks: function (tx) {
|
||||
return Promise.try(() => {
|
||||
return db.TaskInProgress.query(tx).count({ count: "*" });
|
||||
return db.TaskInProgress.query(tx).count({ count: "*" }).where({ started: true });
|
||||
}).then((result) => {
|
||||
return result[0].count;
|
||||
});
|
||||
|
|
|
@ -26,7 +26,7 @@ module.exports = function ({ db }) {
|
|||
tasksInProgress: {
|
||||
relation: Model.HasManyRelation,
|
||||
modelClass: db.TaskInProgress,
|
||||
join: { from: "srap_items.id", to: "srap_tasksInProgress.itemId" }
|
||||
join: { from: "srap_items.id", to: "srap_queue.itemId" }
|
||||
},
|
||||
failedTasks: {
|
||||
// Not actually a many-to-many, but that's what objection calls a HasManyThrough...
|
||||
|
@ -35,7 +35,7 @@ module.exports = function ({ db }) {
|
|||
modelClass: db.Failure,
|
||||
join: {
|
||||
from: "srap_items.id",
|
||||
through: { from: "srap_task_results.itemId", to: "srap_task_results.id" },
|
||||
through: { from: "srap_taskResults.itemId", to: "srap_taskResults.id" },
|
||||
to: "srap_failures.taskResultId"
|
||||
}
|
||||
}
|
||||
|
|
|
@ -11,7 +11,7 @@ module.exports = function ({ db }) {
|
|||
item: {
|
||||
relation: Model.BelongsToOneRelation,
|
||||
modelClass: db.Item,
|
||||
join: { from: "srap_tags.itemId", to: "srap_item.id" }
|
||||
join: { from: "srap_tags.itemId", to: "srap_items.id" }
|
||||
}
|
||||
};
|
||||
};
|
||||
|
|
|
@ -4,7 +4,7 @@ const { Model } = require("objection");
|
|||
|
||||
module.exports = function ({ db }) {
|
||||
return class TaskInProgress extends Model {
|
||||
static tableName = "srap_tasksInProgress";
|
||||
static tableName = "srap_queue";
|
||||
static idColumn = [ "task", "itemId" ];
|
||||
|
||||
static get relationMappings() {
|
||||
|
@ -12,7 +12,7 @@ module.exports = function ({ db }) {
|
|||
item: {
|
||||
relation: Model.BelongsToOneRelation,
|
||||
modelClass: db.Item,
|
||||
join: { from: "srap_tasksInProgress.itemId", to: "srap_item.id" }
|
||||
join: { from: "srap_queue.itemId", to: "srap_items.id" }
|
||||
}
|
||||
};
|
||||
};
|
||||
|
|
|
@ -12,7 +12,7 @@ module.exports = function ({ db }) {
|
|||
item: {
|
||||
relation: Model.BelongsToOneRelation,
|
||||
modelClass: db.Item,
|
||||
join: { from: "srap_taskResults.itemId", to: "srap_item.id" }
|
||||
join: { from: "srap_taskResults.itemId", to: "srap_items.id" }
|
||||
}
|
||||
};
|
||||
};
|
||||
|
|
|
@ -6,7 +6,20 @@ const buffer = require("@promistream/buffer");
|
|||
const pipe = require("@promistream/pipe");
|
||||
const simpleSource = require("@promistream/simple-source");
|
||||
|
||||
const query = `
|
||||
const fetchQuery = `
|
||||
SELECT
|
||||
srap_items.*
|
||||
FROM
|
||||
srap_queue
|
||||
LEFT JOIN srap_items
|
||||
ON srap_items.id = srap_queue.item_id
|
||||
WHERE
|
||||
srap_queue.task = :task
|
||||
AND srap_queue.started = FALSE
|
||||
LIMIT :resultLimit
|
||||
`;
|
||||
|
||||
const fillQuery = `
|
||||
WITH
|
||||
dependency_tasks AS (
|
||||
SELECT * FROM
|
||||
|
@ -14,7 +27,6 @@ const query = `
|
|||
),
|
||||
matching_items AS (
|
||||
SELECT
|
||||
DISTINCT ON (srap_items.id)
|
||||
srap_items.*,
|
||||
results.updated_at AS result_date,
|
||||
results.task_version,
|
||||
|
@ -32,22 +44,19 @@ const query = `
|
|||
LEFT JOIN srap_task_results AS results
|
||||
ON results.item_id = srap_items.id
|
||||
AND results.task = :task
|
||||
WHERE
|
||||
NOT EXISTS (
|
||||
SELECT FROM srap_tasks_in_progress AS pr WHERE pr.item_id = srap_items.id
|
||||
)
|
||||
),
|
||||
candidates AS (
|
||||
SELECT * FROM matching_items
|
||||
WHERE result_date IS NULL
|
||||
UNION
|
||||
UNION ALL
|
||||
SELECT * FROM matching_items
|
||||
WHERE is_candidate = TRUE
|
||||
OR NOT (task_version = :taskVersion)
|
||||
)
|
||||
(
|
||||
SELECT
|
||||
*
|
||||
:task AS task,
|
||||
id AS item_id
|
||||
FROM
|
||||
candidates
|
||||
WHERE
|
||||
|
@ -70,26 +79,53 @@ const query = `
|
|||
)
|
||||
)
|
||||
)
|
||||
) LIMIT :resultLimit;
|
||||
)
|
||||
`;
|
||||
|
||||
module.exports = function ({ metrics, backendSettings }) {
|
||||
module.exports = function ({ metrics, backendSettings, knex }) {
|
||||
return function (tx, { task }) {
|
||||
let refillParameters = {
|
||||
tags: task.tags,
|
||||
task: task.name,
|
||||
taskVersion: task.version,
|
||||
dependencyTaskDefinitions: JSON.stringify(task.dependencies.map((dependency) => {
|
||||
// Case-mapping for SQL compatibility
|
||||
return { task_version: dependency.version, task: dependency.name };
|
||||
}))
|
||||
};
|
||||
|
||||
let fetchParameters = {
|
||||
task: task.name,
|
||||
resultLimit: backendSettings.taskBatchSize
|
||||
};
|
||||
|
||||
function refillQueue() {
|
||||
let startTime = Date.now();
|
||||
|
||||
return Promise.try(() => {
|
||||
// NOTE: We are deliberately bypassing the transaction here! Also deliberately not using VALUES, since we're inserting from the results of another query instead
|
||||
return knex.raw(`
|
||||
INSERT INTO srap_queue (task, item_id)
|
||||
(${fillQuery})
|
||||
ON CONFLICT (task, item_id) DO NOTHING;
|
||||
`, refillParameters);
|
||||
}).then((response) => {
|
||||
let timeElapsed = Date.now() - startTime;
|
||||
|
||||
metrics.taskRefillTime.labels({ task: task.name }).set(timeElapsed / 1000);
|
||||
metrics.taskRefillResults.labels({ task: task.name }).set(response.rowCount);
|
||||
|
||||
debug(`Queue for '${task.name}' was refilled with ${response.rowCount} items in ${timeElapsed}ms`);
|
||||
return response.rowCount;
|
||||
});
|
||||
}
|
||||
|
||||
return pipe([
|
||||
simpleSource(() => {
|
||||
let startTime = Date.now();
|
||||
|
||||
return Promise.try(() => {
|
||||
return tx.raw(query, {
|
||||
tags: task.tags,
|
||||
task: task.name,
|
||||
taskVersion: task.version,
|
||||
resultLimit: backendSettings.taskBatchSize,
|
||||
dependencyTaskDefinitions: JSON.stringify(task.dependencies.map((dependency) => {
|
||||
// Case-mapping for SQL compatibility
|
||||
return { task_version: dependency.version, task: dependency.name };
|
||||
}))
|
||||
});
|
||||
return tx.raw(fetchQuery, fetchParameters);
|
||||
}).then((result) => {
|
||||
let timeElapsed = Date.now() - startTime;
|
||||
|
||||
|
@ -101,8 +137,17 @@ module.exports = function ({ metrics, backendSettings }) {
|
|||
if (result.rowCount > 0) {
|
||||
return result.rows;
|
||||
} else {
|
||||
// TODO: Consider using LISTEN/NOTIFY instead?
|
||||
return Promise.resolve([]).delay(backendSettings.taskBatchDelay);
|
||||
return Promise.try(() => {
|
||||
return refillQueue();
|
||||
}).then((newItems) => {
|
||||
if (newItems === 0) {
|
||||
// TODO: Consider using LISTEN/NOTIFY instead? Worth the added complexity?
|
||||
return Promise.resolve([]).delay(backendSettings.taskBatchDelay);
|
||||
} else {
|
||||
// Have another go right away
|
||||
return [];
|
||||
}
|
||||
});
|
||||
}
|
||||
});
|
||||
}),
|
||||
|
|
|
@ -104,7 +104,7 @@ module.exports = function (state) {
|
|||
return Promise.all([
|
||||
this.getItem(tx, { id: from, optional: true }),
|
||||
this.getItem(tx, { id: into, optional: true }),
|
||||
]).then((fromObj, maybeIntoObj) => {
|
||||
]).then(([ fromObj, maybeIntoObj ]) => {
|
||||
if (fromObj != null) {
|
||||
let intoObj = maybeIntoObj ?? {
|
||||
id: into,
|
||||
|
|
|
@ -38,8 +38,19 @@ module.exports = function createPrometheus() {
|
|||
name: "srap_task_fetch_results_count",
|
||||
help: "Amount of new scraping tasks fetched during the most recent attempt",
|
||||
labelNames: [ "task" ]
|
||||
}),
|
||||
taskRefillTime: new prometheusClient.Gauge({
|
||||
registers: [ prometheusRegistry ],
|
||||
name: "srap_task_refill_seconds",
|
||||
help: "Time needed for the most recent refill of the task queue",
|
||||
labelNames: [ "task" ]
|
||||
}),
|
||||
taskRefillResults: new prometheusClient.Gauge({
|
||||
registers: [ prometheusRegistry ],
|
||||
name: "srap_task_refill_results_count",
|
||||
help: "Amount of new scraping tasks added to queue during the most recent attempt",
|
||||
labelNames: [ "task" ]
|
||||
})
|
||||
// FIXME: Measure queue-refill task
|
||||
}
|
||||
};
|
||||
};
|
||||
|
|
Loading…
Reference in a new issue