runtime: replace virtual checkpoint timer with watchdog
This commit is contained in:
@@ -88,6 +88,7 @@ int thecore_idle(void)
|
|||||||
|
|
||||||
void thecore_destroy(void)
|
void thecore_destroy(void)
|
||||||
{
|
{
|
||||||
|
signal_destroy();
|
||||||
pid_deinit();
|
pid_deinit();
|
||||||
log_destroy();
|
log_destroy();
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,10 +1,100 @@
|
|||||||
#include "stdafx.h"
|
#include "stdafx.h"
|
||||||
|
|
||||||
#include <atomic>
|
#include <atomic>
|
||||||
|
#include <chrono>
|
||||||
|
#include <condition_variable>
|
||||||
|
#include <mutex>
|
||||||
|
#include <thread>
|
||||||
|
|
||||||
namespace
|
namespace
|
||||||
{
|
{
|
||||||
std::atomic<int> s_checkpoint_ticks { 0 };
|
std::atomic<uint64_t> s_checkpoint_progress { 0 };
|
||||||
|
|
||||||
|
#ifndef OS_WINDOWS
|
||||||
|
std::mutex s_checkpoint_mutex;
|
||||||
|
std::condition_variable s_checkpoint_cv;
|
||||||
|
std::thread s_checkpoint_thread;
|
||||||
|
bool s_checkpoint_shutdown = false;
|
||||||
|
bool s_checkpoint_enabled = false;
|
||||||
|
int s_checkpoint_timeout_seconds = 0;
|
||||||
|
uint64_t s_checkpoint_generation = 0;
|
||||||
|
|
||||||
|
void checkpoint_watchdog_loop()
|
||||||
|
{
|
||||||
|
uint64_t last_progress = s_checkpoint_progress.load(std::memory_order_relaxed);
|
||||||
|
auto last_change = std::chrono::steady_clock::now();
|
||||||
|
uint64_t observed_generation = 0;
|
||||||
|
|
||||||
|
std::unique_lock<std::mutex> lock(s_checkpoint_mutex);
|
||||||
|
|
||||||
|
while (!s_checkpoint_shutdown)
|
||||||
|
{
|
||||||
|
if (!s_checkpoint_enabled || s_checkpoint_timeout_seconds <= 0)
|
||||||
|
{
|
||||||
|
s_checkpoint_cv.wait(lock, []()
|
||||||
|
{
|
||||||
|
return s_checkpoint_shutdown || (s_checkpoint_enabled && s_checkpoint_timeout_seconds > 0);
|
||||||
|
});
|
||||||
|
|
||||||
|
last_progress = s_checkpoint_progress.load(std::memory_order_relaxed);
|
||||||
|
last_change = std::chrono::steady_clock::now();
|
||||||
|
observed_generation = s_checkpoint_generation;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
const int timeout_seconds = s_checkpoint_timeout_seconds;
|
||||||
|
const uint64_t generation = s_checkpoint_generation;
|
||||||
|
const auto poll_interval = std::chrono::seconds(1);
|
||||||
|
|
||||||
|
const bool reconfigured = s_checkpoint_cv.wait_for(lock, poll_interval, [generation]()
|
||||||
|
{
|
||||||
|
return s_checkpoint_shutdown || s_checkpoint_generation != generation;
|
||||||
|
});
|
||||||
|
|
||||||
|
if (s_checkpoint_shutdown)
|
||||||
|
break;
|
||||||
|
|
||||||
|
if (reconfigured || observed_generation != s_checkpoint_generation)
|
||||||
|
{
|
||||||
|
last_progress = s_checkpoint_progress.load(std::memory_order_relaxed);
|
||||||
|
last_change = std::chrono::steady_clock::now();
|
||||||
|
observed_generation = s_checkpoint_generation;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
const uint64_t current_progress = s_checkpoint_progress.load(std::memory_order_relaxed);
|
||||||
|
const auto now = std::chrono::steady_clock::now();
|
||||||
|
|
||||||
|
if (current_progress != last_progress)
|
||||||
|
{
|
||||||
|
last_progress = current_progress;
|
||||||
|
last_change = now;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (now - last_change >= std::chrono::seconds(timeout_seconds))
|
||||||
|
{
|
||||||
|
lock.unlock();
|
||||||
|
sys_err("CHECKPOINT shutdown: no progress observed for %d seconds.", timeout_seconds);
|
||||||
|
abort();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void checkpoint_ensure_started()
|
||||||
|
{
|
||||||
|
std::lock_guard<std::mutex> lock(s_checkpoint_mutex);
|
||||||
|
|
||||||
|
if (s_checkpoint_thread.joinable())
|
||||||
|
return;
|
||||||
|
|
||||||
|
s_checkpoint_shutdown = false;
|
||||||
|
s_checkpoint_enabled = false;
|
||||||
|
s_checkpoint_timeout_seconds = 0;
|
||||||
|
s_checkpoint_generation = 0;
|
||||||
|
s_checkpoint_thread = std::thread(checkpoint_watchdog_loop);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
const char* signal_checkpoint_backend_name_impl(ECheckpointBackend backend)
|
const char* signal_checkpoint_backend_name_impl(ECheckpointBackend backend)
|
||||||
{
|
{
|
||||||
@@ -14,6 +104,8 @@ const char* signal_checkpoint_backend_name_impl(ECheckpointBackend backend)
|
|||||||
return "none";
|
return "none";
|
||||||
case CHECKPOINT_BACKEND_VIRTUAL_TIMER:
|
case CHECKPOINT_BACKEND_VIRTUAL_TIMER:
|
||||||
return "virtual-timer";
|
return "virtual-timer";
|
||||||
|
case CHECKPOINT_BACKEND_WATCHDOG_THREAD:
|
||||||
|
return "watchdog-thread";
|
||||||
default:
|
default:
|
||||||
return "unknown";
|
return "unknown";
|
||||||
}
|
}
|
||||||
@@ -22,6 +114,7 @@ const char* signal_checkpoint_backend_name_impl(ECheckpointBackend backend)
|
|||||||
|
|
||||||
#ifdef OS_WINDOWS
|
#ifdef OS_WINDOWS
|
||||||
void signal_setup() {}
|
void signal_setup() {}
|
||||||
|
void signal_destroy() {}
|
||||||
void signal_timer_disable() {}
|
void signal_timer_disable() {}
|
||||||
void signal_timer_enable(int timeout_seconds) {}
|
void signal_timer_enable(int timeout_seconds) {}
|
||||||
void signal_mark_progress() {}
|
void signal_mark_progress() {}
|
||||||
@@ -37,18 +130,6 @@ RETSIGTYPE reap(int sig)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
RETSIGTYPE checkpointing(int sig)
|
|
||||||
{
|
|
||||||
if (!s_checkpoint_ticks.load())
|
|
||||||
{
|
|
||||||
sys_err("CHECKPOINT shutdown: tics did not updated.");
|
|
||||||
abort();
|
|
||||||
}
|
|
||||||
else
|
|
||||||
s_checkpoint_ticks.store(0);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
RETSIGTYPE hupsig(int sig)
|
RETSIGTYPE hupsig(int sig)
|
||||||
{
|
{
|
||||||
shutdowned = TRUE;
|
shutdowned = TRUE;
|
||||||
@@ -62,56 +143,79 @@ RETSIGTYPE usrsig(int sig)
|
|||||||
|
|
||||||
void signal_timer_disable(void)
|
void signal_timer_disable(void)
|
||||||
{
|
{
|
||||||
struct itimerval itime;
|
checkpoint_ensure_started();
|
||||||
struct timeval interval;
|
|
||||||
|
|
||||||
interval.tv_sec = 0;
|
{
|
||||||
interval.tv_usec = 0;
|
std::lock_guard<std::mutex> lock(s_checkpoint_mutex);
|
||||||
|
s_checkpoint_enabled = false;
|
||||||
|
s_checkpoint_timeout_seconds = 0;
|
||||||
|
++s_checkpoint_generation;
|
||||||
|
}
|
||||||
|
|
||||||
itime.it_interval = interval;
|
s_checkpoint_cv.notify_all();
|
||||||
itime.it_value = interval;
|
|
||||||
|
|
||||||
setitimer(ITIMER_VIRTUAL, &itime, NULL);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void signal_timer_enable(int sec)
|
void signal_timer_enable(int sec)
|
||||||
{
|
{
|
||||||
struct itimerval itime;
|
checkpoint_ensure_started();
|
||||||
struct timeval interval;
|
|
||||||
|
|
||||||
interval.tv_sec = sec;
|
{
|
||||||
interval.tv_usec = 0;
|
std::lock_guard<std::mutex> lock(s_checkpoint_mutex);
|
||||||
|
s_checkpoint_enabled = sec > 0;
|
||||||
|
s_checkpoint_timeout_seconds = sec;
|
||||||
|
++s_checkpoint_generation;
|
||||||
|
}
|
||||||
|
|
||||||
itime.it_interval = interval;
|
s_checkpoint_cv.notify_all();
|
||||||
itime.it_value = interval;
|
|
||||||
|
|
||||||
setitimer(ITIMER_VIRTUAL, &itime, NULL);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void signal_setup(void)
|
void signal_setup(void)
|
||||||
{
|
{
|
||||||
signal_timer_enable(30);
|
checkpoint_ensure_started();
|
||||||
|
signal_timer_enable(30);
|
||||||
|
|
||||||
signal(SIGVTALRM, checkpointing);
|
/* just to be on the safe side: */
|
||||||
|
signal(SIGHUP, hupsig);
|
||||||
|
signal(SIGCHLD, reap);
|
||||||
|
signal(SIGINT, hupsig);
|
||||||
|
signal(SIGTERM, hupsig);
|
||||||
|
signal(SIGPIPE, SIG_IGN);
|
||||||
|
signal(SIGALRM, SIG_IGN);
|
||||||
|
signal(SIGUSR1, usrsig);
|
||||||
|
|
||||||
/* just to be on the safe side: */
|
sys_log(0, "[STARTUP] checkpoint backend=%s", signal_checkpoint_backend_name(signal_checkpoint_backend()));
|
||||||
signal(SIGHUP, hupsig);
|
}
|
||||||
signal(SIGCHLD, reap);
|
|
||||||
signal(SIGINT, hupsig);
|
void signal_destroy()
|
||||||
signal(SIGTERM, hupsig);
|
{
|
||||||
signal(SIGPIPE, SIG_IGN);
|
std::thread checkpoint_thread;
|
||||||
signal(SIGALRM, SIG_IGN);
|
|
||||||
signal(SIGUSR1, usrsig);
|
{
|
||||||
|
std::lock_guard<std::mutex> lock(s_checkpoint_mutex);
|
||||||
|
|
||||||
|
if (!s_checkpoint_thread.joinable())
|
||||||
|
return;
|
||||||
|
|
||||||
|
s_checkpoint_shutdown = true;
|
||||||
|
s_checkpoint_enabled = false;
|
||||||
|
s_checkpoint_timeout_seconds = 0;
|
||||||
|
++s_checkpoint_generation;
|
||||||
|
checkpoint_thread = std::move(s_checkpoint_thread);
|
||||||
|
}
|
||||||
|
|
||||||
|
s_checkpoint_cv.notify_all();
|
||||||
|
checkpoint_thread.join();
|
||||||
|
s_checkpoint_progress.store(0, std::memory_order_relaxed);
|
||||||
}
|
}
|
||||||
|
|
||||||
void signal_mark_progress()
|
void signal_mark_progress()
|
||||||
{
|
{
|
||||||
s_checkpoint_ticks.fetch_add(1, std::memory_order_relaxed);
|
s_checkpoint_progress.fetch_add(1, std::memory_order_relaxed);
|
||||||
}
|
}
|
||||||
|
|
||||||
ECheckpointBackend signal_checkpoint_backend()
|
ECheckpointBackend signal_checkpoint_backend()
|
||||||
{
|
{
|
||||||
return CHECKPOINT_BACKEND_VIRTUAL_TIMER;
|
return CHECKPOINT_BACKEND_WATCHDOG_THREAD;
|
||||||
}
|
}
|
||||||
|
|
||||||
const char* signal_checkpoint_backend_name(ECheckpointBackend backend)
|
const char* signal_checkpoint_backend_name(ECheckpointBackend backend)
|
||||||
|
|||||||
@@ -4,9 +4,11 @@ enum ECheckpointBackend
|
|||||||
{
|
{
|
||||||
CHECKPOINT_BACKEND_NONE = 0,
|
CHECKPOINT_BACKEND_NONE = 0,
|
||||||
CHECKPOINT_BACKEND_VIRTUAL_TIMER = 1,
|
CHECKPOINT_BACKEND_VIRTUAL_TIMER = 1,
|
||||||
|
CHECKPOINT_BACKEND_WATCHDOG_THREAD = 2,
|
||||||
};
|
};
|
||||||
|
|
||||||
void signal_setup();
|
void signal_setup();
|
||||||
|
void signal_destroy();
|
||||||
void signal_timer_disable();
|
void signal_timer_disable();
|
||||||
void signal_timer_enable(int timeout_seconds);
|
void signal_timer_enable(int timeout_seconds);
|
||||||
void signal_mark_progress();
|
void signal_mark_progress();
|
||||||
|
|||||||
@@ -317,8 +317,8 @@ void TestCheckpointBackendMetadata()
|
|||||||
Expect(std::strcmp(signal_checkpoint_backend_name(signal_checkpoint_backend()), "none") == 0,
|
Expect(std::strcmp(signal_checkpoint_backend_name(signal_checkpoint_backend()), "none") == 0,
|
||||||
"Unexpected checkpoint backend name on Windows");
|
"Unexpected checkpoint backend name on Windows");
|
||||||
#else
|
#else
|
||||||
Expect(signal_checkpoint_backend() == CHECKPOINT_BACKEND_VIRTUAL_TIMER, "Expected virtual timer checkpoint backend");
|
Expect(signal_checkpoint_backend() == CHECKPOINT_BACKEND_WATCHDOG_THREAD, "Expected watchdog thread checkpoint backend");
|
||||||
Expect(std::strcmp(signal_checkpoint_backend_name(signal_checkpoint_backend()), "virtual-timer") == 0,
|
Expect(std::strcmp(signal_checkpoint_backend_name(signal_checkpoint_backend()), "watchdog-thread") == 0,
|
||||||
"Unexpected checkpoint backend name");
|
"Unexpected checkpoint backend name");
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user