From e638816026fd9b96a810d0cb3d2fe02f9c46a344 Mon Sep 17 00:00:00 2001
From: server
Date: Tue, 14 Apr 2026 07:40:07 +0200
Subject: [PATCH] runtime: replace virtual checkpoint timer with watchdog

---
Reviewer notes (this area is outside the commit message and outside the diff):

Reconstruction
- This copy of the patch had all line breaks collapsed and every
  angle-bracketed span removed. Line breaks were restored so that each hunk
  matches the old/new line counts in its "@@" header and the diffstat below.
- The four added #include targets and the template arguments on std::atomic,
  std::unique_lock and std::lock_guard were re-inserted from the names the
  added code uses. TODO confirm against the original commit.
- The element type on the removed s_checkpoint_ticks line is a guess, and so
  is all leading whitespace (tabs vs. spaces) on context and removed lines.
  Both must match the tree for the patch to apply -- TODO confirm.
- The sender address on the From: header was lost and is not reconstructed.

What the patch does, as visible in the hunks
- Removes the SIGVTALRM handler and the setitimer(ITIMER_VIRTUAL) arming
  code, and adds a thread that wakes about once per second, compares
  s_checkpoint_progress with the last value it saw, and calls abort() once
  the value has not changed for the configured number of seconds.
- signal_timer_enable()/signal_timer_disable() now only update shared state
  under s_checkpoint_mutex, bump s_checkpoint_generation and notify the
  thread; a generation change makes the thread restart its measurement.
- signal_destroy() is new; thecore_destroy() calls it first. It asks the
  thread to stop, joins it, and resets the progress counter.

Points to verify before merging
- NOTE(review): the watchdog measures std::chrono::steady_clock time, while
  the removed timer was ITIMER_VIRTUAL. Confirm that every code path that
  can legitimately block for longer than the timeout either calls
  signal_mark_progress() or is bracketed by signal_timer_disable()/enable().
- NOTE(review): signal_timer_disable() and signal_timer_enable() both call
  checkpoint_ensure_started(), so either one creates the thread if it is not
  running, including after signal_destroy(). s_checkpoint_thread is joined
  only in signal_destroy(); confirm no caller touches the timer afterwards
  and that every exit path reaches thecore_destroy().
- NOTE(review): signal_setup() calls checkpoint_ensure_started() and then
  signal_timer_enable(30), which calls it again; the first call looks
  redundant.

 src/libthecore/main.cpp   |   1 +
 src/libthecore/signal.cpp | 186 +++++++++++++++++++++++++++++---------
 src/libthecore/signal.h   |   2 +
 tests/smoke_auth.cpp      |   4 +-
 4 files changed, 150 insertions(+), 43 deletions(-)

diff --git a/src/libthecore/main.cpp b/src/libthecore/main.cpp
index b31b540..ef0ffa6 100644
--- a/src/libthecore/main.cpp
+++ b/src/libthecore/main.cpp
@@ -88,6 +88,7 @@ int thecore_idle(void)
 
 void thecore_destroy(void)
 {
+	signal_destroy();
     pid_deinit();
     log_destroy();
 }
diff --git a/src/libthecore/signal.cpp b/src/libthecore/signal.cpp
index 24167ab..5e0aa7a 100644
--- a/src/libthecore/signal.cpp
+++ b/src/libthecore/signal.cpp
@@ -1,10 +1,100 @@
 #include "stdafx.h"
 
 #include <atomic>
+#include <chrono>
+#include <condition_variable>
+#include <mutex>
+#include <thread>
 
 namespace
 {
-std::atomic<int> s_checkpoint_ticks { 0 };
+std::atomic<uint64_t> s_checkpoint_progress { 0 };
+
+#ifndef OS_WINDOWS
+std::mutex s_checkpoint_mutex;
+std::condition_variable s_checkpoint_cv;
+std::thread s_checkpoint_thread;
+bool s_checkpoint_shutdown = false;
+bool s_checkpoint_enabled = false;
+int s_checkpoint_timeout_seconds = 0;
+uint64_t s_checkpoint_generation = 0;
+
+void checkpoint_watchdog_loop()
+{
+	uint64_t last_progress = s_checkpoint_progress.load(std::memory_order_relaxed);
+	auto last_change = std::chrono::steady_clock::now();
+	uint64_t observed_generation = 0;
+
+	std::unique_lock<std::mutex> lock(s_checkpoint_mutex);
+
+	while (!s_checkpoint_shutdown)
+	{
+		if (!s_checkpoint_enabled || s_checkpoint_timeout_seconds <= 0)
+		{
+			s_checkpoint_cv.wait(lock, []()
+			{
+				return s_checkpoint_shutdown || (s_checkpoint_enabled && s_checkpoint_timeout_seconds > 0);
+			});
+
+			last_progress = s_checkpoint_progress.load(std::memory_order_relaxed);
+			last_change = std::chrono::steady_clock::now();
+			observed_generation = s_checkpoint_generation;
+			continue;
+		}
+
+		const int timeout_seconds = s_checkpoint_timeout_seconds;
+		const uint64_t generation = s_checkpoint_generation;
+		const auto poll_interval = std::chrono::seconds(1);
+
+		const bool reconfigured = s_checkpoint_cv.wait_for(lock, poll_interval, [generation]()
+		{
+			return s_checkpoint_shutdown || s_checkpoint_generation != generation;
+		});
+
+		if (s_checkpoint_shutdown)
+			break;
+
+		if (reconfigured || observed_generation != s_checkpoint_generation)
+		{
+			last_progress = s_checkpoint_progress.load(std::memory_order_relaxed);
+			last_change = std::chrono::steady_clock::now();
+			observed_generation = s_checkpoint_generation;
+			continue;
+		}
+
+		const uint64_t current_progress = s_checkpoint_progress.load(std::memory_order_relaxed);
+		const auto now = std::chrono::steady_clock::now();
+
+		if (current_progress != last_progress)
+		{
+			last_progress = current_progress;
+			last_change = now;
+			continue;
+		}
+
+		if (now - last_change >= std::chrono::seconds(timeout_seconds))
+		{
+			lock.unlock();
+			sys_err("CHECKPOINT shutdown: no progress observed for %d seconds.", timeout_seconds);
+			abort();
+		}
+	}
+}
+
+void checkpoint_ensure_started()
+{
+	std::lock_guard<std::mutex> lock(s_checkpoint_mutex);
+
+	if (s_checkpoint_thread.joinable())
+		return;
+
+	s_checkpoint_shutdown = false;
+	s_checkpoint_enabled = false;
+	s_checkpoint_timeout_seconds = 0;
+	s_checkpoint_generation = 0;
+	s_checkpoint_thread = std::thread(checkpoint_watchdog_loop);
+}
+#endif
 
 const char* signal_checkpoint_backend_name_impl(ECheckpointBackend backend)
 {
@@ -14,6 +104,8 @@ const char* signal_checkpoint_backend_name_impl(ECheckpointBackend backend)
 		return "none";
 	case CHECKPOINT_BACKEND_VIRTUAL_TIMER:
 		return "virtual-timer";
+	case CHECKPOINT_BACKEND_WATCHDOG_THREAD:
+		return "watchdog-thread";
 	default:
 		return "unknown";
 	}
@@ -22,6 +114,7 @@ const char* signal_checkpoint_backend_name_impl(ECheckpointBackend backend)
 
 #ifdef OS_WINDOWS
 void signal_setup() {}
+void signal_destroy() {}
 void signal_timer_disable() {}
 void signal_timer_enable(int timeout_seconds) {}
 void signal_mark_progress() {}
@@ -37,18 +130,6 @@ RETSIGTYPE reap(int sig)
 }
 
 
-RETSIGTYPE checkpointing(int sig)
-{
-    if (!s_checkpoint_ticks.load())
-    {
-        sys_err("CHECKPOINT shutdown: tics did not updated.");
-        abort();
-    }
-    else
-        s_checkpoint_ticks.store(0);
-}
-
-
 RETSIGTYPE hupsig(int sig)
 {
     shutdowned = TRUE;
@@ -62,56 +143,79 @@ RETSIGTYPE usrsig(int sig)
 
 void signal_timer_disable(void)
 {
-    struct itimerval itime;
-    struct timeval interval;
+	checkpoint_ensure_started();
 
-    interval.tv_sec = 0;
-    interval.tv_usec = 0;
+	{
+		std::lock_guard<std::mutex> lock(s_checkpoint_mutex);
+		s_checkpoint_enabled = false;
+		s_checkpoint_timeout_seconds = 0;
+		++s_checkpoint_generation;
+	}
 
-    itime.it_interval = interval;
-    itime.it_value = interval;
-
-    setitimer(ITIMER_VIRTUAL, &itime, NULL);
+	s_checkpoint_cv.notify_all();
 }
 
 void signal_timer_enable(int sec)
 {
-    struct itimerval itime;
-    struct timeval interval;
+	checkpoint_ensure_started();
 
-    interval.tv_sec = sec;
-    interval.tv_usec = 0;
+	{
+		std::lock_guard<std::mutex> lock(s_checkpoint_mutex);
+		s_checkpoint_enabled = sec > 0;
+		s_checkpoint_timeout_seconds = sec;
+		++s_checkpoint_generation;
+	}
 
-    itime.it_interval = interval;
-    itime.it_value = interval;
-
-    setitimer(ITIMER_VIRTUAL, &itime, NULL);
+	s_checkpoint_cv.notify_all();
 }
 
 void signal_setup(void)
 {
-    signal_timer_enable(30);
+	checkpoint_ensure_started();
+	signal_timer_enable(30);
 
-    signal(SIGVTALRM, checkpointing);
+	/* just to be on the safe side: */
+	signal(SIGHUP, hupsig);
+	signal(SIGCHLD, reap);
+	signal(SIGINT, hupsig);
+	signal(SIGTERM, hupsig);
+	signal(SIGPIPE, SIG_IGN);
+	signal(SIGALRM, SIG_IGN);
+	signal(SIGUSR1, usrsig);
 
-    /* just to be on the safe side: */
-    signal(SIGHUP, hupsig);
-    signal(SIGCHLD, reap);
-    signal(SIGINT, hupsig);
-    signal(SIGTERM, hupsig);
-    signal(SIGPIPE, SIG_IGN);
-    signal(SIGALRM, SIG_IGN);
-    signal(SIGUSR1, usrsig);
+	sys_log(0, "[STARTUP] checkpoint backend=%s", signal_checkpoint_backend_name(signal_checkpoint_backend()));
+}
+
+void signal_destroy()
+{
+	std::thread checkpoint_thread;
+
+	{
+		std::lock_guard<std::mutex> lock(s_checkpoint_mutex);
+
+		if (!s_checkpoint_thread.joinable())
+			return;
+
+		s_checkpoint_shutdown = true;
+		s_checkpoint_enabled = false;
+		s_checkpoint_timeout_seconds = 0;
+		++s_checkpoint_generation;
+		checkpoint_thread = std::move(s_checkpoint_thread);
+	}
+
+	s_checkpoint_cv.notify_all();
+	checkpoint_thread.join();
+	s_checkpoint_progress.store(0, std::memory_order_relaxed);
 }
 
 void signal_mark_progress()
 {
-	s_checkpoint_ticks.fetch_add(1, std::memory_order_relaxed);
+	s_checkpoint_progress.fetch_add(1, std::memory_order_relaxed);
 }
 
 ECheckpointBackend signal_checkpoint_backend()
 {
-	return CHECKPOINT_BACKEND_VIRTUAL_TIMER;
+	return CHECKPOINT_BACKEND_WATCHDOG_THREAD;
 }
 
 const char* signal_checkpoint_backend_name(ECheckpointBackend backend)
diff --git a/src/libthecore/signal.h b/src/libthecore/signal.h
index b64a627..53cbdf3 100644
--- a/src/libthecore/signal.h
+++ b/src/libthecore/signal.h
@@ -4,9 +4,11 @@ enum ECheckpointBackend
 {
 	CHECKPOINT_BACKEND_NONE = 0,
 	CHECKPOINT_BACKEND_VIRTUAL_TIMER = 1,
+	CHECKPOINT_BACKEND_WATCHDOG_THREAD = 2,
 };
 
 void signal_setup();
+void signal_destroy();
 void signal_timer_disable();
 void signal_timer_enable(int timeout_seconds);
 void signal_mark_progress();
diff --git a/tests/smoke_auth.cpp b/tests/smoke_auth.cpp
index 55ee502..7d56048 100644
--- a/tests/smoke_auth.cpp
+++ b/tests/smoke_auth.cpp
@@ -317,8 +317,8 @@ void TestCheckpointBackendMetadata()
 	Expect(std::strcmp(signal_checkpoint_backend_name(signal_checkpoint_backend()), "none") == 0,
 		"Unexpected checkpoint backend name on Windows");
 #else
-	Expect(signal_checkpoint_backend() == CHECKPOINT_BACKEND_VIRTUAL_TIMER, "Expected virtual timer checkpoint backend");
-	Expect(std::strcmp(signal_checkpoint_backend_name(signal_checkpoint_backend()), "virtual-timer") == 0,
+	Expect(signal_checkpoint_backend() == CHECKPOINT_BACKEND_WATCHDOG_THREAD, "Expected watchdog thread checkpoint backend");
+	Expect(std::strcmp(signal_checkpoint_backend_name(signal_checkpoint_backend()), "watchdog-thread") == 0,
 		"Unexpected checkpoint backend name");
 #endif
 }