--[[
  Move stalled jobs to wait.

    Input:
      KEYS[1] 'stalled' (SET)
      KEYS[2] 'wait', (LIST)
      KEYS[3] 'active', (LIST)
      KEYS[4] 'stalled-check', (KEY)
      KEYS[5] 'meta', (KEY)
      KEYS[6] 'paused', (LIST)
      KEYS[7] 'marker'
      KEYS[8] 'event stream' (STREAM)

      ARGV[1] Max stalled job count
      ARGV[2] queue.toKey('')
      ARGV[3] timestamp
      ARGV[4] max check time

    Events:
      'stalled' with stalled job id.
]]
-- Shorthand used for every Redis command issued by this script.
local rcall = redis.call

-- Includes
--- @include "includes/addJobInTargetList"
--- @include "includes/batches"
--- @include "includes/moveJobToWait"
--- @include "includes/trimEvents"

-- Bind the eight positional keys to readable names (KEYS[1] .. KEYS[8]).
local stalledKey, waitKey, activeKey, stalledCheckKey =
  KEYS[1], KEYS[2], KEYS[3], KEYS[4]
local metaKey, pausedKey, markerKey, eventStreamKey =
  KEYS[5], KEYS[6], KEYS[7], KEYS[8]

-- Bind the positional arguments. Only the stall limit is converted to a
-- number, because it is compared numerically further down; the others are
-- passed on as received.
local maxStalledJobCount = tonumber(ARGV[1])
local queueKeyPrefix, timestamp, maxCheckTime = ARGV[2], ARGV[3], ARGV[4]

-- While the 'stalled-check' key still exists, a previous run of this script
-- set it and its expiry has not elapsed yet: skip the check and report no
-- stalled jobs.
local checkedRecently = rcall("EXISTS", stalledCheckKey) == 1
if checkedRecently then
  return {}
end

-- Arm the guard for the next callers; the key expires by itself after
-- maxCheckTime milliseconds (PX).
rcall("SET", stalledCheckKey, timestamp, "PX", maxCheckTime)

-- Trim events before emitting them to avoid trimming events emitted in this script
trimEvents(metaKey, eventStreamKey)

-- Tell whether a job's serialized options contain a truthy "repeat" entry.
--   rawOpts: value of the job hash's "opts" field as returned by HGET
--            (false when the field is missing).
--   returns: boolean
-- Decoding goes through pcall on purpose: this is called after the job has
-- already been removed from the active list, and Redis does not undo earlier
-- writes when a script raises, so a decode error here would leave the job in
-- no list at all.
local function hasRepeatOption(rawOpts)
  if not rawOpts then
    return false
  end

  local decoded, opts = pcall(cjson.decode, rawOpts)
  -- Non-table results (e.g. JSON null, which decodes to cjson.null) cannot be
  -- indexed, so they are treated the same as "no repeat option".
  if not decoded or type(opts) ~= "table" then
    return false
  end

  return opts["repeat"] and true or false
end

-- Move all stalled jobs to wait
-- `stalling` holds the ids found in the 'stalled' set, which the end of this
-- script refills from the active list on every run.
local stalling = rcall('SMEMBERS', stalledKey)
-- Ids of the jobs that were actually re-queued; returned as the script result.
local stalled = {}
if (#stalling > 0) then
  -- The candidates are now held in `stalling`; clear the set before it is
  -- repopulated.
  rcall('DEL', stalledKey)

  -- Remove from active list
  for _, jobId in ipairs(stalling) do
    -- Markers in waitlist DEPRECATED in v5: Remove in v6.
    if string.sub(jobId, 1, 2) == "0:" then
      -- If the jobId is a delay marker ID we just remove it.
      rcall("LREM", activeKey, 1, jobId)
    else
      local jobKey = queueKeyPrefix .. jobId

      -- Check that the lock is also missing, then we can handle this job as really stalled.
      if (rcall("EXISTS", jobKey .. ":lock") == 0) then
        -- Remove from the active queue.
        local removed = rcall("LREM", activeKey, 1, jobId)

        -- Only continue when the id really was in the active list.
        if (removed > 0) then
          -- Bump the per-job stall counter ("stc" field of the job hash).
          local stalledCount = rcall("HINCRBY", jobKey, "stc", 1)

          -- Check if this is a repeatable job by looking at job options
          local isRepeatableJob = hasRepeatOption(rcall("HGET", jobKey, "opts"))

          -- Only flag the job if it exceeds the stall limit AND is not a
          -- repeatable job. Flagging just writes the reason into the "defa"
          -- field; the job is still moved back to wait below.
          -- NOTE(review): presumably "defa" is acted upon by whoever picks the
          -- job up next - confirm against the consumer side.
          if stalledCount > maxStalledJobCount and not isRepeatableJob then
            rcall("HSET", jobKey, "defa", "job stalled more than allowable limit")
          end

          moveJobToWait(metaKey, activeKey, waitKey, pausedKey, markerKey, eventStreamKey, jobId,
            "RPUSH")

          -- Emit the stalled event
          rcall("XADD", eventStreamKey, "*", "event", "stalled", "jobId", jobId)
          stalled[#stalled + 1] = jobId
        end
      end
    end
  end
end

-- Mark potentially stalled jobs
-- Every id currently in the active list is copied into the 'stalled' set,
-- where the next run of this script will read it back with SMEMBERS.
local activeJobIds = rcall('LRANGE', activeKey, 0, -1)
local activeCount = #activeJobIds

if activeCount > 0 then
  -- Add the ids in slices of at most 7000 per SADD call.
  -- NOTE(review): presumably the slice size keeps unpack() within Lua's
  -- stack/argument limits - confirm.
  for first, last in batches(activeCount, 7000) do
    rcall('SADD', stalledKey, unpack(activeJobIds, first, last))
  end
end

-- Ids of the jobs that this run moved back to wait.
return stalled