runtime: replace virtual checkpoint timer with watchdog
Some checks failed
build / Linux asan (push) Has been cancelled
build / Linux release (push) Has been cancelled
build / FreeBSD build (push) Has been cancelled

This commit is contained in:
server
2026-04-14 07:40:07 +02:00
parent cecc822777
commit e638816026
4 changed files with 150 additions and 43 deletions

View File

@@ -88,6 +88,7 @@ int thecore_idle(void)
void thecore_destroy(void) void thecore_destroy(void)
{ {
signal_destroy();
pid_deinit(); pid_deinit();
log_destroy(); log_destroy();
} }

View File

@@ -1,10 +1,100 @@
#include "stdafx.h" #include "stdafx.h"
#include <atomic> #include <atomic>
#include <chrono>
#include <condition_variable>
#include <mutex>
#include <thread>
namespace namespace
{ {
std::atomic<int> s_checkpoint_ticks { 0 }; std::atomic<uint64_t> s_checkpoint_progress { 0 };
#ifndef OS_WINDOWS
// Guards every non-atomic watchdog variable declared below; taken by the
// watchdog thread and by the enable/disable/destroy entry points.
std::mutex s_checkpoint_mutex{};
// Signalled after any configuration change so the watchdog thread re-reads
// the state instead of sleeping out its current wait.
std::condition_variable s_checkpoint_cv{};
// Handle of the watchdog thread; joinable only while the thread is running.
std::thread s_checkpoint_thread{};
// Set to true to ask the watchdog loop to exit.
bool s_checkpoint_shutdown{ false };
// True while the watchdog should actively monitor progress.
bool s_checkpoint_enabled{ false };
// Number of seconds without progress after which the watchdog aborts the
// process; values <= 0 keep the watchdog idle.
int s_checkpoint_timeout_seconds{ 0 };
// Incremented on every reconfiguration so the watchdog can tell that its
// current measurement window is stale and must be restarted.
uint64_t s_checkpoint_generation{ 0 };
void checkpoint_watchdog_loop()
{
uint64_t last_progress = s_checkpoint_progress.load(std::memory_order_relaxed);
auto last_change = std::chrono::steady_clock::now();
uint64_t observed_generation = 0;
std::unique_lock<std::mutex> lock(s_checkpoint_mutex);
while (!s_checkpoint_shutdown)
{
if (!s_checkpoint_enabled || s_checkpoint_timeout_seconds <= 0)
{
s_checkpoint_cv.wait(lock, []()
{
return s_checkpoint_shutdown || (s_checkpoint_enabled && s_checkpoint_timeout_seconds > 0);
});
last_progress = s_checkpoint_progress.load(std::memory_order_relaxed);
last_change = std::chrono::steady_clock::now();
observed_generation = s_checkpoint_generation;
continue;
}
const int timeout_seconds = s_checkpoint_timeout_seconds;
const uint64_t generation = s_checkpoint_generation;
const auto poll_interval = std::chrono::seconds(1);
const bool reconfigured = s_checkpoint_cv.wait_for(lock, poll_interval, [generation]()
{
return s_checkpoint_shutdown || s_checkpoint_generation != generation;
});
if (s_checkpoint_shutdown)
break;
if (reconfigured || observed_generation != s_checkpoint_generation)
{
last_progress = s_checkpoint_progress.load(std::memory_order_relaxed);
last_change = std::chrono::steady_clock::now();
observed_generation = s_checkpoint_generation;
continue;
}
const uint64_t current_progress = s_checkpoint_progress.load(std::memory_order_relaxed);
const auto now = std::chrono::steady_clock::now();
if (current_progress != last_progress)
{
last_progress = current_progress;
last_change = now;
continue;
}
if (now - last_change >= std::chrono::seconds(timeout_seconds))
{
lock.unlock();
sys_err("CHECKPOINT shutdown: no progress observed for %d seconds.", timeout_seconds);
abort();
}
}
}
// Starts the watchdog thread if it is not already running.
//
// When a thread has to be created, the shared watchdog state is first reset to
// "not shut down, disabled, no timeout, generation 0". The thread is launched
// while the mutex is still held; the watchdog loop takes the same mutex, so it
// cannot inspect the state until this function has returned.
void checkpoint_ensure_started()
{
	std::lock_guard<std::mutex> guard(s_checkpoint_mutex);

	if (!s_checkpoint_thread.joinable())
	{
		s_checkpoint_shutdown = false;
		s_checkpoint_enabled = false;
		s_checkpoint_timeout_seconds = 0;
		s_checkpoint_generation = 0;

		s_checkpoint_thread = std::thread(checkpoint_watchdog_loop);
	}
}
#endif
const char* signal_checkpoint_backend_name_impl(ECheckpointBackend backend) const char* signal_checkpoint_backend_name_impl(ECheckpointBackend backend)
{ {
@@ -14,6 +104,8 @@ const char* signal_checkpoint_backend_name_impl(ECheckpointBackend backend)
return "none"; return "none";
case CHECKPOINT_BACKEND_VIRTUAL_TIMER: case CHECKPOINT_BACKEND_VIRTUAL_TIMER:
return "virtual-timer"; return "virtual-timer";
case CHECKPOINT_BACKEND_WATCHDOG_THREAD:
return "watchdog-thread";
default: default:
return "unknown"; return "unknown";
} }
@@ -22,6 +114,7 @@ const char* signal_checkpoint_backend_name_impl(ECheckpointBackend backend)
#ifdef OS_WINDOWS #ifdef OS_WINDOWS
void signal_setup() {} void signal_setup() {}
void signal_destroy() {}
void signal_timer_disable() {} void signal_timer_disable() {}
void signal_timer_enable(int timeout_seconds) {} void signal_timer_enable(int timeout_seconds) {}
void signal_mark_progress() {} void signal_mark_progress() {}
@@ -37,18 +130,6 @@ RETSIGTYPE reap(int sig)
} }
// Signal handler for the checkpoint timer.
//
// Each time the timer fires, the tick counter must have been bumped at least
// once since the previous firing; otherwise the process is considered stuck,
// the failure is logged and the process is aborted.
//
// @param sig  Signal number delivered to the handler (unused).
RETSIGTYPE checkpointing(int sig)
{
	// Read and clear the counter in one atomic step. A separate load()
	// followed by store(0) could throw away ticks recorded between the two
	// operations.
	if (s_checkpoint_ticks.exchange(0) == 0)
	{
		// NOTE(review): sys_err runs in signal context here — confirm it is
		// async-signal-safe.
		sys_err("CHECKPOINT shutdown: tics did not updated.");
		abort();
	}
}
RETSIGTYPE hupsig(int sig) RETSIGTYPE hupsig(int sig)
{ {
shutdowned = TRUE; shutdowned = TRUE;
@@ -62,56 +143,79 @@ RETSIGTYPE usrsig(int sig)
void signal_timer_disable(void) void signal_timer_disable(void)
{ {
struct itimerval itime; checkpoint_ensure_started();
struct timeval interval;
interval.tv_sec = 0; {
interval.tv_usec = 0; std::lock_guard<std::mutex> lock(s_checkpoint_mutex);
s_checkpoint_enabled = false;
s_checkpoint_timeout_seconds = 0;
++s_checkpoint_generation;
}
itime.it_interval = interval; s_checkpoint_cv.notify_all();
itime.it_value = interval;
setitimer(ITIMER_VIRTUAL, &itime, NULL);
} }
void signal_timer_enable(int sec) void signal_timer_enable(int sec)
{ {
struct itimerval itime; checkpoint_ensure_started();
struct timeval interval;
interval.tv_sec = sec; {
interval.tv_usec = 0; std::lock_guard<std::mutex> lock(s_checkpoint_mutex);
s_checkpoint_enabled = sec > 0;
s_checkpoint_timeout_seconds = sec;
++s_checkpoint_generation;
}
itime.it_interval = interval; s_checkpoint_cv.notify_all();
itime.it_value = interval;
setitimer(ITIMER_VIRTUAL, &itime, NULL);
} }
void signal_setup(void) void signal_setup(void)
{ {
signal_timer_enable(30); checkpoint_ensure_started();
signal_timer_enable(30);
signal(SIGVTALRM, checkpointing); /* just to be on the safe side: */
signal(SIGHUP, hupsig);
signal(SIGCHLD, reap);
signal(SIGINT, hupsig);
signal(SIGTERM, hupsig);
signal(SIGPIPE, SIG_IGN);
signal(SIGALRM, SIG_IGN);
signal(SIGUSR1, usrsig);
/* just to be on the safe side: */ sys_log(0, "[STARTUP] checkpoint backend=%s", signal_checkpoint_backend_name(signal_checkpoint_backend()));
signal(SIGHUP, hupsig); }
signal(SIGCHLD, reap);
signal(SIGINT, hupsig); void signal_destroy()
signal(SIGTERM, hupsig); {
signal(SIGPIPE, SIG_IGN); std::thread checkpoint_thread;
signal(SIGALRM, SIG_IGN);
signal(SIGUSR1, usrsig); {
std::lock_guard<std::mutex> lock(s_checkpoint_mutex);
if (!s_checkpoint_thread.joinable())
return;
s_checkpoint_shutdown = true;
s_checkpoint_enabled = false;
s_checkpoint_timeout_seconds = 0;
++s_checkpoint_generation;
checkpoint_thread = std::move(s_checkpoint_thread);
}
s_checkpoint_cv.notify_all();
checkpoint_thread.join();
s_checkpoint_progress.store(0, std::memory_order_relaxed);
} }
void signal_mark_progress() void signal_mark_progress()
{ {
s_checkpoint_ticks.fetch_add(1, std::memory_order_relaxed); s_checkpoint_progress.fetch_add(1, std::memory_order_relaxed);
} }
ECheckpointBackend signal_checkpoint_backend() ECheckpointBackend signal_checkpoint_backend()
{ {
return CHECKPOINT_BACKEND_VIRTUAL_TIMER; return CHECKPOINT_BACKEND_WATCHDOG_THREAD;
} }
const char* signal_checkpoint_backend_name(ECheckpointBackend backend) const char* signal_checkpoint_backend_name(ECheckpointBackend backend)

View File

@@ -4,9 +4,11 @@ enum ECheckpointBackend
{ {
CHECKPOINT_BACKEND_NONE = 0, CHECKPOINT_BACKEND_NONE = 0,
CHECKPOINT_BACKEND_VIRTUAL_TIMER = 1, CHECKPOINT_BACKEND_VIRTUAL_TIMER = 1,
CHECKPOINT_BACKEND_WATCHDOG_THREAD = 2,
}; };
void signal_setup(); void signal_setup();
void signal_destroy();
void signal_timer_disable(); void signal_timer_disable();
void signal_timer_enable(int timeout_seconds); void signal_timer_enable(int timeout_seconds);
void signal_mark_progress(); void signal_mark_progress();

View File

@@ -317,8 +317,8 @@ void TestCheckpointBackendMetadata()
Expect(std::strcmp(signal_checkpoint_backend_name(signal_checkpoint_backend()), "none") == 0, Expect(std::strcmp(signal_checkpoint_backend_name(signal_checkpoint_backend()), "none") == 0,
"Unexpected checkpoint backend name on Windows"); "Unexpected checkpoint backend name on Windows");
#else #else
Expect(signal_checkpoint_backend() == CHECKPOINT_BACKEND_VIRTUAL_TIMER, "Expected virtual timer checkpoint backend"); Expect(signal_checkpoint_backend() == CHECKPOINT_BACKEND_WATCHDOG_THREAD, "Expected watchdog thread checkpoint backend");
Expect(std::strcmp(signal_checkpoint_backend_name(signal_checkpoint_backend()), "virtual-timer") == 0, Expect(std::strcmp(signal_checkpoint_backend_name(signal_checkpoint_backend()), "watchdog-thread") == 0,
"Unexpected checkpoint backend name"); "Unexpected checkpoint backend name");
#endif #endif
} }