Refactor to pre-generate task queue for performance, fix some refactoring errors, fix some model bugs
This commit is contained in:
parent
fb93e902a8
commit
b9b0e63454
21
.vscode/launch.json
vendored
Normal file
21
.vscode/launch.json
vendored
Normal file
|
@ -0,0 +1,21 @@
|
||||||
|
{
|
||||||
|
// Use IntelliSense to learn about possible attributes.
|
||||||
|
// Hover to view descriptions of existing attributes.
|
||||||
|
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
|
||||||
|
"version": "0.2.0",
|
||||||
|
"configurations": [
|
||||||
|
{
|
||||||
|
"type": "node",
|
||||||
|
"request": "launch",
|
||||||
|
"name": "Launch Program",
|
||||||
|
"skipFiles": [
|
||||||
|
"<node_internals>/**"
|
||||||
|
],
|
||||||
|
"program": "./bin/simulate",
|
||||||
|
"args": ["../seekseek/scraper-config/", "lcsc:normalizeProduct", "lcsc:product:C494972"],
|
||||||
|
"env": {
|
||||||
|
"DEBUG": "srap:backend:postgresql:*"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
19
migrations/20221123175316_queue-table.js
Normal file
19
migrations/20221123175316_queue-table.js
Normal file
|
@ -0,0 +1,19 @@
|
||||||
|
"use strict";
|
||||||
|
|
||||||
|
/**
 * Forward migration: turns the `srap_tasks_in_progress` table into the general-purpose `srap_queue` table.
 *
 * - `started_at` becomes nullable with a NULL default.
 * - A new non-null boolean column `started` is added, defaulting to FALSE.
 * - The table is then renamed to `srap_queue`.
 *
 * @param {object} knex - Knex instance supplied by the migration runner.
 * @param {Function} Promise - Promise constructor supplied by the migration runner (unused here).
 * @returns {object} The knex schema builder chain, which the migration runner waits on.
 */
module.exports.up = function(knex, Promise) {
	const reshapeColumns = (queueTable) => {
		queueTable.timestamp("started_at").alter().nullable().defaultTo(null);
		queueTable.boolean("started").notNullable().defaultTo(false);
	};

	return knex.schema
		.alterTable("srap_tasks_in_progress", reshapeColumns)
		.renameTable("srap_tasks_in_progress", "srap_queue");
};
|
||||||
|
|
||||||
|
/**
 * Rollback migration: restores `srap_queue` to the original `srap_tasks_in_progress` shape.
 *
 * Rows that were never started (or that carry no `started_at` timestamp) are removed first.
 * Such rows cannot satisfy the restored NOT NULL constraint on `started_at`, so leaving them
 * in place would make the ALTER fail; any that survived would also show up as in-progress tasks
 * in a table that is meant to contain only in-progress tasks.
 *
 * @param {object} knex - Knex instance supplied by the migration runner.
 * @param {Function} Promise - Promise constructor supplied by the migration runner (unused here).
 * @returns {Promise} Resolves once the cleanup and the schema changes have been applied.
 */
module.exports.down = function(knex, Promise) {
	return knex("srap_queue")
		.where("started", false)
		.orWhereNull("started_at")
		.del()
		.then(() => {
			return knex.schema
				.renameTable("srap_queue", "srap_tasks_in_progress")
				.alterTable("srap_tasks_in_progress", (table) => {
					table.timestamp("started_at").alter().notNullable().defaultTo(knex.fn.now());
					table.dropColumn("started");
				});
		});
};
|
|
@ -237,7 +237,7 @@ module.exports = function (state) {
|
||||||
});
|
});
|
||||||
|
|
||||||
return mutableOperation((tx) => {
|
return mutableOperation((tx) => {
|
||||||
return backend.moveItem(tx, { options, allowMerge: (options.merge != null) });
|
return backend.moveItem(tx, { ... options, allowMerge: (options.merge != null) });
|
||||||
});
|
});
|
||||||
},
|
},
|
||||||
|
|
||||||
|
|
|
@ -85,10 +85,9 @@ module.exports = function(state) {
|
||||||
|
|
||||||
lock: function (tx, { id, task }) {
|
lock: function (tx, { id, task }) {
|
||||||
return Promise.try(() => {
|
return Promise.try(() => {
|
||||||
return db.TaskInProgress.query(tx).insert({
|
// FIXME: use Objection for this?
|
||||||
task: task.name,
|
// FIXME: Read item first to make it prevent edit conflicts via transaction
|
||||||
item_id: id
|
return tx.raw(`UPDATE srap_queue SET started = TRUE, started_at = NOW() WHERE task = :task AND item_id = :id`, { task: task.name, id: id });
|
||||||
});
|
|
||||||
}).then(() => {
|
}).then(() => {
|
||||||
return true;
|
return true;
|
||||||
}).catch({ name: "UniqueViolationError" }, () => {
|
}).catch({ name: "UniqueViolationError" }, () => {
|
||||||
|
@ -338,7 +337,7 @@ module.exports = function(state) {
|
||||||
|
|
||||||
countLockedTasks: function (tx) {
|
countLockedTasks: function (tx) {
|
||||||
return Promise.try(() => {
|
return Promise.try(() => {
|
||||||
return db.TaskInProgress.query(tx).count({ count: "*" });
|
return db.TaskInProgress.query(tx).count({ count: "*" }).where({ started: true });
|
||||||
}).then((result) => {
|
}).then((result) => {
|
||||||
return result[0].count;
|
return result[0].count;
|
||||||
});
|
});
|
||||||
|
|
|
@ -26,7 +26,7 @@ module.exports = function ({ db }) {
|
||||||
tasksInProgress: {
|
tasksInProgress: {
|
||||||
relation: Model.HasManyRelation,
|
relation: Model.HasManyRelation,
|
||||||
modelClass: db.TaskInProgress,
|
modelClass: db.TaskInProgress,
|
||||||
join: { from: "srap_items.id", to: "srap_tasksInProgress.itemId" }
|
join: { from: "srap_items.id", to: "srap_queue.itemId" }
|
||||||
},
|
},
|
||||||
failedTasks: {
|
failedTasks: {
|
||||||
// Not actually a many-to-many, but that's what objection calls a HasManyThrough...
|
// Not actually a many-to-many, but that's what objection calls a HasManyThrough...
|
||||||
|
@ -35,7 +35,7 @@ module.exports = function ({ db }) {
|
||||||
modelClass: db.Failure,
|
modelClass: db.Failure,
|
||||||
join: {
|
join: {
|
||||||
from: "srap_items.id",
|
from: "srap_items.id",
|
||||||
through: { from: "srap_task_results.itemId", to: "srap_task_results.id" },
|
through: { from: "srap_taskResults.itemId", to: "srap_taskResults.id" },
|
||||||
to: "srap_failures.taskResultId"
|
to: "srap_failures.taskResultId"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -11,7 +11,7 @@ module.exports = function ({ db }) {
|
||||||
item: {
|
item: {
|
||||||
relation: Model.BelongsToOneRelation,
|
relation: Model.BelongsToOneRelation,
|
||||||
modelClass: db.Item,
|
modelClass: db.Item,
|
||||||
join: { from: "srap_tags.itemId", to: "srap_item.id" }
|
join: { from: "srap_tags.itemId", to: "srap_items.id" }
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
|
@ -4,7 +4,7 @@ const { Model } = require("objection");
|
||||||
|
|
||||||
module.exports = function ({ db }) {
|
module.exports = function ({ db }) {
|
||||||
return class TaskInProgress extends Model {
|
return class TaskInProgress extends Model {
|
||||||
static tableName = "srap_tasksInProgress";
|
static tableName = "srap_queue";
|
||||||
static idColumn = [ "task", "itemId" ];
|
static idColumn = [ "task", "itemId" ];
|
||||||
|
|
||||||
static get relationMappings() {
|
static get relationMappings() {
|
||||||
|
@ -12,7 +12,7 @@ module.exports = function ({ db }) {
|
||||||
item: {
|
item: {
|
||||||
relation: Model.BelongsToOneRelation,
|
relation: Model.BelongsToOneRelation,
|
||||||
modelClass: db.Item,
|
modelClass: db.Item,
|
||||||
join: { from: "srap_tasksInProgress.itemId", to: "srap_item.id" }
|
join: { from: "srap_queue.itemId", to: "srap_items.id" }
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
|
@ -12,7 +12,7 @@ module.exports = function ({ db }) {
|
||||||
item: {
|
item: {
|
||||||
relation: Model.BelongsToOneRelation,
|
relation: Model.BelongsToOneRelation,
|
||||||
modelClass: db.Item,
|
modelClass: db.Item,
|
||||||
join: { from: "srap_taskResults.itemId", to: "srap_item.id" }
|
join: { from: "srap_taskResults.itemId", to: "srap_items.id" }
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
|
@ -6,7 +6,20 @@ const buffer = require("@promistream/buffer");
|
||||||
const pipe = require("@promistream/pipe");
|
const pipe = require("@promistream/pipe");
|
||||||
const simpleSource = require("@promistream/simple-source");
|
const simpleSource = require("@promistream/simple-source");
|
||||||
|
|
||||||
const query = `
|
const fetchQuery = `
|
||||||
|
SELECT
|
||||||
|
srap_items.*
|
||||||
|
FROM
|
||||||
|
srap_queue
|
||||||
|
LEFT JOIN srap_items
|
||||||
|
ON srap_items.id = srap_queue.item_id
|
||||||
|
WHERE
|
||||||
|
srap_queue.task = :task
|
||||||
|
AND srap_queue.started = FALSE
|
||||||
|
LIMIT :resultLimit
|
||||||
|
`;
|
||||||
|
|
||||||
|
const fillQuery = `
|
||||||
WITH
|
WITH
|
||||||
dependency_tasks AS (
|
dependency_tasks AS (
|
||||||
SELECT * FROM
|
SELECT * FROM
|
||||||
|
@ -14,7 +27,6 @@ const query = `
|
||||||
),
|
),
|
||||||
matching_items AS (
|
matching_items AS (
|
||||||
SELECT
|
SELECT
|
||||||
DISTINCT ON (srap_items.id)
|
|
||||||
srap_items.*,
|
srap_items.*,
|
||||||
results.updated_at AS result_date,
|
results.updated_at AS result_date,
|
||||||
results.task_version,
|
results.task_version,
|
||||||
|
@ -32,22 +44,19 @@ const query = `
|
||||||
LEFT JOIN srap_task_results AS results
|
LEFT JOIN srap_task_results AS results
|
||||||
ON results.item_id = srap_items.id
|
ON results.item_id = srap_items.id
|
||||||
AND results.task = :task
|
AND results.task = :task
|
||||||
WHERE
|
|
||||||
NOT EXISTS (
|
|
||||||
SELECT FROM srap_tasks_in_progress AS pr WHERE pr.item_id = srap_items.id
|
|
||||||
)
|
|
||||||
),
|
),
|
||||||
candidates AS (
|
candidates AS (
|
||||||
SELECT * FROM matching_items
|
SELECT * FROM matching_items
|
||||||
WHERE result_date IS NULL
|
WHERE result_date IS NULL
|
||||||
UNION
|
UNION ALL
|
||||||
SELECT * FROM matching_items
|
SELECT * FROM matching_items
|
||||||
WHERE is_candidate = TRUE
|
WHERE is_candidate = TRUE
|
||||||
OR NOT (task_version = :taskVersion)
|
OR NOT (task_version = :taskVersion)
|
||||||
)
|
)
|
||||||
(
|
(
|
||||||
SELECT
|
SELECT
|
||||||
*
|
:task AS task,
|
||||||
|
id AS item_id
|
||||||
FROM
|
FROM
|
||||||
candidates
|
candidates
|
||||||
WHERE
|
WHERE
|
||||||
|
@ -70,26 +79,53 @@ const query = `
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
) LIMIT :resultLimit;
|
)
|
||||||
`;
|
`;
|
||||||
|
|
||||||
module.exports = function ({ metrics, backendSettings }) {
|
module.exports = function ({ metrics, backendSettings, knex }) {
|
||||||
return function (tx, { task }) {
|
return function (tx, { task }) {
|
||||||
|
// Bind parameters passed along with `fillQuery` when the queue is refilled.
let refillParameters = {
	tags: task.tags,
	task: task.name,
	taskVersion: task.version,
	// Serialised to JSON; keys are case-mapped (snake_case) for SQL compatibility
	dependencyTaskDefinitions: JSON.stringify(
		task.dependencies.map(({ version, name }) => ({ task_version: version, task: name }))
	)
};

// Bind parameters passed along with `fetchQuery` when a batch of queue items is read.
let fetchParameters = {
	task: task.name,
	resultLimit: backendSettings.taskBatchSize
};
|
||||||
|
|
||||||
|
/**
 * Inserts the rows selected by `fillQuery` into `srap_queue` for the current task, skipping
 * (task, item_id) pairs that are already present, and records how long the refill took and
 * how many rows the database reported.
 *
 * @returns {Promise<number>} The `rowCount` of the INSERT statement.
 */
function refillQueue() {
	const refillStartedAt = Date.now();

	return Promise.try(() => {
		// NOTE: We are deliberately bypassing the transaction here! Also deliberately not using VALUES, since we're inserting from the results of another query instead
		const insertStatement = `
			INSERT INTO srap_queue (task, item_id)
			(${fillQuery})
			ON CONFLICT (task, item_id) DO NOTHING;
		`;

		return knex.raw(insertStatement, refillParameters);
	}).then((response) => {
		const elapsedMs = Date.now() - refillStartedAt;
		const insertedCount = response.rowCount;

		// The refill-time gauge is reported in seconds, hence the division
		metrics.taskRefillTime.labels({ task: task.name }).set(elapsedMs / 1000);
		metrics.taskRefillResults.labels({ task: task.name }).set(insertedCount);

		debug(`Queue for '${task.name}' was refilled with ${insertedCount} items in ${elapsedMs}ms`);

		return insertedCount;
	});
}
|
||||||
|
|
||||||
return pipe([
|
return pipe([
|
||||||
simpleSource(() => {
|
simpleSource(() => {
|
||||||
let startTime = Date.now();
|
let startTime = Date.now();
|
||||||
|
|
||||||
return Promise.try(() => {
|
return Promise.try(() => {
|
||||||
return tx.raw(query, {
|
return tx.raw(fetchQuery, fetchParameters);
|
||||||
tags: task.tags,
|
|
||||||
task: task.name,
|
|
||||||
taskVersion: task.version,
|
|
||||||
resultLimit: backendSettings.taskBatchSize,
|
|
||||||
dependencyTaskDefinitions: JSON.stringify(task.dependencies.map((dependency) => {
|
|
||||||
// Case-mapping for SQL compatibility
|
|
||||||
return { task_version: dependency.version, task: dependency.name };
|
|
||||||
}))
|
|
||||||
});
|
|
||||||
}).then((result) => {
|
}).then((result) => {
|
||||||
let timeElapsed = Date.now() - startTime;
|
let timeElapsed = Date.now() - startTime;
|
||||||
|
|
||||||
|
@ -101,8 +137,17 @@ module.exports = function ({ metrics, backendSettings }) {
|
||||||
if (result.rowCount > 0) {
|
if (result.rowCount > 0) {
|
||||||
return result.rows;
|
return result.rows;
|
||||||
} else {
|
} else {
|
||||||
// TODO: Consider using LISTEN/NOTIFY instead?
|
return Promise.try(() => {
|
||||||
|
return refillQueue();
|
||||||
|
}).then((newItems) => {
|
||||||
|
if (newItems === 0) {
|
||||||
|
// TODO: Consider using LISTEN/NOTIFY instead? Worth the added complexity?
|
||||||
return Promise.resolve([]).delay(backendSettings.taskBatchDelay);
|
return Promise.resolve([]).delay(backendSettings.taskBatchDelay);
|
||||||
|
} else {
|
||||||
|
// Have another go right away
|
||||||
|
return [];
|
||||||
|
}
|
||||||
|
});
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
}),
|
}),
|
||||||
|
|
|
@ -104,7 +104,7 @@ module.exports = function (state) {
|
||||||
return Promise.all([
|
return Promise.all([
|
||||||
this.getItem(tx, { id: from, optional: true }),
|
this.getItem(tx, { id: from, optional: true }),
|
||||||
this.getItem(tx, { id: into, optional: true }),
|
this.getItem(tx, { id: into, optional: true }),
|
||||||
]).then((fromObj, maybeIntoObj) => {
|
]).then(([ fromObj, maybeIntoObj ]) => {
|
||||||
if (fromObj != null) {
|
if (fromObj != null) {
|
||||||
let intoObj = maybeIntoObj ?? {
|
let intoObj = maybeIntoObj ?? {
|
||||||
id: into,
|
id: into,
|
||||||
|
|
|
@ -38,8 +38,19 @@ module.exports = function createPrometheus() {
|
||||||
name: "srap_task_fetch_results_count",
|
name: "srap_task_fetch_results_count",
|
||||||
help: "Amount of new scraping tasks fetched during the most recent attempt",
|
help: "Amount of new scraping tasks fetched during the most recent attempt",
|
||||||
labelNames: [ "task" ]
|
labelNames: [ "task" ]
|
||||||
|
}),
|
||||||
|
taskRefillTime: new prometheusClient.Gauge({
|
||||||
|
registers: [ prometheusRegistry ],
|
||||||
|
name: "srap_task_refill_seconds",
|
||||||
|
help: "Time needed for the most recent refill of the task queue",
|
||||||
|
labelNames: [ "task" ]
|
||||||
|
}),
|
||||||
|
taskRefillResults: new prometheusClient.Gauge({
|
||||||
|
registers: [ prometheusRegistry ],
|
||||||
|
name: "srap_task_refill_results_count",
|
||||||
|
help: "Amount of new scraping tasks added to queue during the most recent attempt",
|
||||||
|
labelNames: [ "task" ]
|
||||||
})
|
})
|
||||||
// FIXME: Measure queue-refill task
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
Loading…
Reference in a new issue