From 6f16f66543f05ccc6dd22886e8e5c2c2053affd8 Mon Sep 17 00:00:00 2001 From: server Date: Tue, 14 Apr 2026 17:05:11 +0200 Subject: [PATCH] ops: snapshot crash executables --- deploy/systemd/bin/metin-collect-incident.in | 75 ++++++++++++++++++++ deploy/systemd/bin/metin-core-backtrace.in | 33 +++++++-- docs/server-management.md | 5 +- 3 files changed, 107 insertions(+), 6 deletions(-) diff --git a/deploy/systemd/bin/metin-collect-incident.in b/deploy/systemd/bin/metin-collect-incident.in index 17f6ee3..c102b63 100644 --- a/deploy/systemd/bin/metin-collect-incident.in +++ b/deploy/systemd/bin/metin-collect-incident.in @@ -4,6 +4,7 @@ from __future__ import annotations import argparse import json import os +import re import shutil import socket import subprocess @@ -106,6 +107,78 @@ def copy_core_files(bundle_dir: Path, core_files: list[Path]) -> None: shutil.copy2(path, destination) +def infer_execfn_from_file_output(core_path: Path) -> Path | None: + completed = run(["file", str(core_path)], check=False) + if completed.returncode != 0: + return None + + match = re.search(r"execfn: '([^']+)'", completed.stdout) + if not match: + return None + + candidate = Path(match.group(1)) + if candidate.exists(): + return candidate.resolve() + return None + + +def infer_executable_for_core(core_path: Path) -> Path | None: + execfn_candidate = infer_execfn_from_file_output(core_path) + if execfn_candidate: + return execfn_candidate + + parent_name = core_path.parent.name + grandparent_name = core_path.parent.parent.name if core_path.parent.parent else "" + + if parent_name == "db": + candidate = (core_path.parent / "db").resolve() + return candidate if candidate.is_file() else None + if parent_name == "auth": + candidate = (core_path.parent / "game_auth").resolve() + return candidate if candidate.is_file() else None + if parent_name.startswith("core") and grandparent_name.startswith("channel"): + candidate = (core_path.parent / f"{grandparent_name}_{parent_name}").resolve() + return candidate if candidate.is_file() else None + + return None + + +def executable_metadata(path: Path) -> dict[str, object]: + stat = path.stat() + return { + "path": str(path), + "size_bytes": stat.st_size, + "mtime": datetime.fromtimestamp(stat.st_mtime, tz=timezone.utc).isoformat(), + } + + +def write_core_executable_metadata(bundle_dir: Path, core_files: list[Path]) -> None: + rows = [] + for core_path in core_files: + row: dict[str, object] = {"core": str(core_path)} + executable = infer_executable_for_core(core_path) + if executable: + row["executable"] = executable_metadata(executable) + else: + row["executable"] = None + rows.append(row) + write_text(bundle_dir / "core-executables.json", json.dumps(rows, indent=2)) + + +def copy_core_executables(bundle_dir: Path, core_files: list[Path]) -> None: + executables_dir = bundle_dir / "executables" + copied: set[Path] = set() + for core_path in core_files: + executable = infer_executable_for_core(core_path) + if not executable or executable in copied: + continue + copied.add(executable) + relative = executable.relative_to(RUNTIME_ROOT) + destination = executables_dir / relative + destination.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(executable, destination) + + def git_summary(repo_path: Path) -> dict[str, object]: summary: dict[str, object] = {"path": str(repo_path), "present": repo_path.exists()} if not repo_path.exists(): @@ -180,8 +253,10 @@ def main() -> int: core_files = find_core_files() write_core_metadata(bundle_dir, core_files) + write_core_executable_metadata(bundle_dir, core_files) if args.include_cores and core_files: copy_core_files(bundle_dir, core_files) + copy_core_executables(bundle_dir, core_files) print(bundle_dir) return 0 diff --git a/deploy/systemd/bin/metin-core-backtrace.in b/deploy/systemd/bin/metin-core-backtrace.in index d200fe5..3c4cdfa 100644 --- a/deploy/systemd/bin/metin-core-backtrace.in +++ b/deploy/systemd/bin/metin-core-backtrace.in @@ -6,6 +6,7 @@ import os import re import shutil import subprocess +from datetime import datetime, timezone from pathlib import Path RUNTIME_ROOT = Path("{{RUNTIME_ROOT}}") @@ -75,10 +76,15 @@ def infer_executable(core_path: Path, exe_arg: str | None) -> Path: raise SystemExit(f"Executable not found: {exe_path}") return exe_path + execfn_candidate = infer_execfn_from_file_output(core_path) + parent_name = core_path.parent.name grandparent_name = core_path.parent.parent.name if core_path.parent.parent else "" candidates: list[Path] = [] + if execfn_candidate: + candidates.append(execfn_candidate) + if parent_name == "db": candidates.append(core_path.parent / "db") elif parent_name == "auth": @@ -86,10 +92,6 @@ def infer_executable(core_path: Path, exe_arg: str | None) -> Path: elif parent_name.startswith("core") and grandparent_name.startswith("channel"): candidates.append(core_path.parent / f"{grandparent_name}_{parent_name}") - execfn_candidate = infer_execfn_from_file_output(core_path) - if execfn_candidate: - candidates.append(execfn_candidate) - for candidate in candidates: if candidate.is_file(): return candidate.resolve() @@ -114,6 +116,28 @@ def render_file_info(path: Path) -> str: return format_section(f"file {path}", body) +def render_executable_freshness(core_path: Path, exe_path: Path) -> str: + core_stat = core_path.stat() + exe_stat = exe_path.stat() + core_mtime = datetime.fromtimestamp(core_stat.st_mtime, tz=timezone.utc).isoformat() + exe_mtime = datetime.fromtimestamp(exe_stat.st_mtime, tz=timezone.utc).isoformat() + + lines = [ + f"core_mtime: {core_mtime}", + f"exe_mtime: {exe_mtime}", + ] + + if exe_stat.st_mtime > core_stat.st_mtime + 1: + lines.append( + "warning: executable is newer than the core file; symbols may not match. " + "Prefer an executable snapshot from an incident bundle or pass --exe explicitly." + ) + else: + lines.append("status: executable is not newer than the core file") + + return format_section("core/executable freshness", "\n".join(lines)) + + def render_readelf_notes(core_path: Path) -> str: if not shutil.which("readelf"): return "" @@ -173,6 +197,7 @@ def main() -> int: ), render_file_info(core_path), render_file_info(exe_path), + render_executable_freshness(core_path, exe_path), ] readelf_section = render_readelf_notes(core_path) diff --git a/docs/server-management.md b/docs/server-management.md index 88132f7..c0bb25b 100644 --- a/docs/server-management.md +++ b/docs/server-management.md @@ -241,8 +241,9 @@ Each bundle contains: - listener state from `ss -ltnp` - tailed runtime `syslog.log` and `syserr.log` files - metadata for any `core*` files found under `runtime/server/channels` +- metadata for the executable inferred for each core file -If you call it with `--include-cores`, matching core files are copied into the bundle as well. +If you call it with `--include-cores`, matching core files are copied into the bundle as well. In the same mode, the inferred executable files are copied too, so a later redeploy does not destroy your ability to symbolicate the crash with the original binary snapshot. The runtime units now also declare `LimitCORE=infinity`, so after the next service restart the processes are allowed to emit core dumps when the host kernel/core policy permits it. @@ -252,4 +253,4 @@ For quick manual crash triage outside the incident bundle flow, use: metinctl backtrace ``` -It defaults to the newest core file under the runtime tree, infers the executable path, and uses `gdb` or `lldb` when present on the host. If no supported debugger is installed, it still prints file/readelf metadata for the core and executable. +It defaults to the newest core file under the runtime tree, infers the executable path, and uses `gdb` or `lldb` when present on the host. If no supported debugger is installed, it still prints file/readelf metadata for the core and executable. If the current executable is newer than the core file, the helper prints an explicit warning because the backtrace may no longer match the crashed binary.