ops: add core backtrace helper

2026-04-14 17:01:50 +02:00
parent c5bb515781
commit 0bc6559283
5 changed files with 245 additions and 0 deletions
--- a/deploy/systemd/README.md
+++ b/deploy/systemd/README.md
@@ -35,6 +35,7 @@ The channel selection and port layout now come from the versioned inventory file
 - `/usr/local/libexec/metin-wait-port`
 - `/usr/local/bin/metinctl`
 - `/usr/local/sbin/metin-collect-incident`
 - `/usr/local/sbin/metin-core-backtrace`
 The `metin-db-ready.service` gate waits until the DB socket is actually accepting connections before `auth` and `game` units start.
--- a/deploy/systemd/bin/metin-core-backtrace.in
+++ b/deploy/systemd/bin/metin-core-backtrace.in
@@ -0,0 +1,197 @@
 #!/usr/bin/env python3
 from __future__ import annotations
 import argparse
 import os
 import re
 import shutil
 import subprocess
 from pathlib import Path
 # Root of the runtime tree that is searched for core files.
 # NOTE(review): "{{RUNTIME_ROOT}}" is a template placeholder; presumably the
 # installer's render_template step substitutes the real path — confirm there.
 RUNTIME_ROOT = Path("{{RUNTIME_ROOT}}")
 def parse_args() -> argparse.Namespace:
    """Parse the command line of the backtrace helper.

    Returns:
        A namespace with the optional string attributes ``core`` and ``exe``;
        each is ``None`` when the corresponding flag was not supplied.
    """
    cli = argparse.ArgumentParser(description="Generate a backtrace for a Metin runtime core file")
    options = (
        ("--core", "Core file path. Defaults to the newest core under the runtime tree."),
        ("--exe", "Executable path override. If omitted, infer it from the core path."),
    )
    for flag, help_text in options:
        cli.add_argument(flag, help=help_text)
    return cli.parse_args()
 def ensure_root() -> None:
    """Abort unless the process runs with an effective UID of 0.

    Raises:
        SystemExit: with the message ``Run as root.`` for any other user.
    """
    running_as_root = os.geteuid() == 0
    if running_as_root:
        return
    raise SystemExit("Run as root.")
 def run(command: list[str], check: bool = False) -> subprocess.CompletedProcess[str]:
    """Run ``command`` and capture its stdout/stderr as text.

    The helper is used to collect best-effort diagnostics, so two failure
    modes that would otherwise abort the whole report are absorbed here:

    * Output that is not valid in the locale encoding is decoded with
      ``errors="replace"`` instead of raising ``UnicodeDecodeError``.
    * When the program cannot be started at all (not installed, not
      executable) and ``check`` is false, a synthetic failed result is
      returned: exit status 127 (not found) or 126 (other OS error), empty
      stdout, and the reason in stderr.

    Args:
        command: Program and arguments; executed without a shell.
        check: When true, a non-zero exit status raises
            ``subprocess.CalledProcessError`` and start-up errors propagate.

    Returns:
        The completed process with ``stdout`` and ``stderr`` as ``str``.
    """
    try:
        return subprocess.run(
            command,
            check=check,
            capture_output=True,
            text=True,
            errors="replace",
        )
    except OSError as error:
        if check:
            # Callers that asked for strict behaviour still get the exception.
            raise
        # Mirror the shell convention: 127 = command not found, 126 = cannot execute.
        status = 127 if isinstance(error, FileNotFoundError) else 126
        return subprocess.CompletedProcess(
            command,
            status,
            stdout="",
            stderr=f"cannot execute {command[0]!r}: {error}\n",
        )
 def iter_core_files() -> list[Path]:
    """List core files under ``RUNTIME_ROOT``, newest first.

    Only regular files matching ``channels/**/core*`` are considered, so
    directories with a matching name are ignored.

    Returns:
        The matching paths ordered by modification time, most recent first.
    """
    found = [entry for entry in RUNTIME_ROOT.glob("channels/**/core*") if entry.is_file()]
    found.sort(key=lambda entry: entry.stat().st_mtime, reverse=True)
    return found
 def resolve_core_path(core_arg: str | None) -> Path:
    """Determine which core file to analyse.

    Args:
        core_arg: Path given on the command line, or a falsy value to pick
            the newest core file from ``iter_core_files()``. A relative path
            is tried against ``RUNTIME_ROOT`` first and is otherwise left
            relative to the current working directory.

    Returns:
        The resolved path of the selected core file.

    Raises:
        SystemExit: if the requested file does not exist or no core file is
            found at all.
    """
    if not core_arg:
        newest_first = iter_core_files()
        if newest_first:
            return newest_first[0]
        raise SystemExit(f"No core files found under {RUNTIME_ROOT}")

    chosen = Path(core_arg)
    if not chosen.is_absolute():
        # Allow paths written relative to the runtime tree.
        under_runtime = RUNTIME_ROOT / core_arg
        if under_runtime.exists():
            chosen = under_runtime
    chosen = chosen.resolve()
    if chosen.is_file():
        return chosen
    raise SystemExit(f"Core file not found: {chosen}")
 def infer_execfn_from_file_output(core_path: Path) -> Path | None:
    """Extract the ``execfn`` path that ``file`` reports for a core file.

    Args:
        core_path: Core file to inspect.

    Returns:
        The path quoted after ``execfn:`` in the ``file`` output when it
        names an existing file; ``None`` if ``file`` fails, prints no
        ``execfn`` entry, or the path is not a file.
    """
    result = run(["file", str(core_path)])
    if result.returncode != 0:
        return None
    found = re.search(r"execfn: '([^']+)'", result.stdout)
    if found is None:
        return None
    execfn = Path(found.group(1))
    return execfn if execfn.is_file() else None
 def infer_executable(core_path: Path, exe_arg: str | None) -> Path:
    """Return the executable that belongs to ``core_path``.

    Resolution order:

    1. ``exe_arg`` when given; it must name an existing file.
    2. The directory-layout convention, based on the core file's directory:
       ``db/db``, ``auth/game_auth``, or ``channelX/coreY/channelX_coreY``.
    3. The ``execfn`` entry reported by ``file`` for the core. This is only
       consulted when step 2 did not yield an existing file, so no
       subprocess is spawned when the layout already answers the question.

    Args:
        core_path: Core file being analysed.
        exe_arg: Explicit executable path, or a falsy value to infer it.

    Returns:
        The resolved path of the executable.

    Raises:
        SystemExit: if ``exe_arg`` is not a file or nothing can be inferred.
    """
    if exe_arg:
        exe_path = Path(exe_arg).resolve()
        if not exe_path.is_file():
            raise SystemExit(f"Executable not found: {exe_path}")
        return exe_path

    core_dir = core_path.parent
    parent_name = core_dir.name
    # Path.name is "" at the filesystem root, so no extra guard is needed.
    grandparent_name = core_dir.parent.name

    layout_candidate: Path | None = None
    if parent_name == "db":
        layout_candidate = core_dir / "db"
    elif parent_name == "auth":
        layout_candidate = core_dir / "game_auth"
    elif parent_name.startswith("core") and grandparent_name.startswith("channel"):
        layout_candidate = core_dir / f"{grandparent_name}_{parent_name}"

    if layout_candidate is not None and layout_candidate.is_file():
        return layout_candidate.resolve()

    # Fall back to what the core file itself records about its executable.
    execfn_candidate = infer_execfn_from_file_output(core_path)
    if execfn_candidate and execfn_candidate.is_file():
        return execfn_candidate.resolve()

    raise SystemExit(f"Could not infer executable for core file: {core_path}")
 def preferred_debugger() -> str | None:
    """Pick the debugger to use, trying ``gdb`` before ``lldb``.

    Returns:
        The name of the first tool found on ``PATH``, or ``None`` when
        neither is available.
    """
    available = (name for name in ("gdb", "lldb") if shutil.which(name))
    return next(available, None)
 def format_section(title: str, body: str) -> str:
    """Render one report section.

    Args:
        title: Text placed between the ``==`` markers of the heading line.
        body: Section content; trailing whitespace is removed.

    Returns:
        The heading line, the trimmed body, and a final newline.
    """
    heading = f"== {title} =="
    return "\n".join((heading, body.rstrip(), ""))
 def render_file_info(path: Path) -> str:
    """Build the report section that shows the ``file`` output for ``path``.

    Stdout is preferred, then stderr; ``<no output>`` is shown when both
    are empty.
    """
    result = run(["file", str(path)])
    text = result.stdout
    if not text:
        text = result.stderr
    if not text:
        text = "<no output>"
    return format_section(f"file {path}", text)
 def render_readelf_notes(core_path: Path) -> str:
    """Build the report section with the ``readelf -n`` output for the core.

    Returns:
        The formatted section, or an empty string when ``readelf`` is not on
        ``PATH`` so that the caller can omit the section.
    """
    if shutil.which("readelf"):
        result = run(["readelf", "-n", str(core_path)])
        text = result.stdout or result.stderr or "<no output>"
        return format_section(f"readelf -n {core_path}", text)
    return ""
 def render_debugger_backtrace(debugger: str, exe_path: Path, core_path: Path) -> str:
    """Run the debugger in batch mode and return the ``backtrace`` section.

    Args:
        debugger: ``"gdb"`` or ``"lldb"``.
        exe_path: Executable the core file belongs to.
        core_path: Core file to load.

    Returns:
        A section containing the space-joined command line followed by the
        debugger's stdout, or its stderr when stdout is empty, or
        ``<no output>`` when both are empty.

    Raises:
        SystemExit: for any other ``debugger`` value.
    """
    exe = str(exe_path)
    core = str(core_path)
    # One argv per supported debugger; note the differing exe/core order.
    invocations = {
        "gdb": [
            "gdb",
            "-batch",
            "-ex",
            "set pagination off",
            "-ex",
            "thread apply all bt full",
            exe,
            core,
        ],
        "lldb": [
            "lldb",
            "--batch",
            "-o",
            "thread backtrace all",
            "-c",
            core,
            exe,
        ],
    }
    command = invocations.get(debugger)
    if command is None:
        raise SystemExit(f"Unsupported debugger: {debugger}")

    result = run(command)
    output = result.stdout or result.stderr or "<no output>"
    shown_command = " ".join(command)
    return format_section("backtrace", f"$ {shown_command}\n\n{output}")
 def main() -> int:
    """Print a crash-triage report for one core file.

    The report consists of a summary, ``file`` output for the core and the
    executable, ``readelf -n`` notes when available, and either a debugger
    backtrace or a hint that no supported debugger is installed.

    Returns:
        ``0`` once the report has been printed.
    """
    args = parse_args()
    # Arguments are parsed first, so argparse runs before the root check.
    ensure_root()
    core_path = resolve_core_path(args.core)
    exe_path = infer_executable(core_path, args.exe)
    debugger = preferred_debugger()

    summary_lines = [
        f"core: {core_path}",
        f"executable: {exe_path}",
        f"debugger: {debugger or '<none>'}",
    ]
    sections = [format_section("summary", "\n".join(summary_lines))]
    sections.append(render_file_info(core_path))
    sections.append(render_file_info(exe_path))

    notes = render_readelf_notes(core_path)
    if notes:
        sections.append(notes)

    if debugger:
        backtrace = render_debugger_backtrace(debugger, exe_path, core_path)
    else:
        backtrace = format_section(
            "backtrace",
            "No supported debugger found. Install gdb or lldb on the host to generate a stack trace.",
        )
    sections.append(backtrace)

    report = "\n".join(section.rstrip() for section in sections if section)
    print(report.rstrip())
    return 0
 # Script entry point: main()'s return value becomes the process exit status.
 if __name__ == "__main__":
    raise SystemExit(main())
--- a/deploy/systemd/bin/metinctl.in
+++ b/deploy/systemd/bin/metinctl.in
@@ -18,6 +18,7 @@ REPO_ROOT = Path("{{REPO_ROOT}}")
 RUNTIME_ROOT = Path("{{RUNTIME_ROOT}}")
 HEALTHCHECK_PATH = Path("/usr/local/sbin/metin-login-healthcheck")
 INCIDENT_COLLECTOR_PATH = Path("/usr/local/sbin/metin-collect-incident")
 # Location of the core backtrace helper that the "backtrace" subcommand runs.
 CORE_BACKTRACE_PATH = Path("/usr/local/sbin/metin-core-backtrace")
 INCIDENT_ROOT = Path("/var/lib/metin/incidents")
 AUTH_SYSLOG_PATH = RUNTIME_ROOT / "channels" / "auth" / "syslog.log"
 SOURCE_REPO_ROOT = RUNTIME_ROOT.parent.parent / "repos" / "m2dev-server-src"
@@ -116,6 +117,10 @@ def parse_args() -> argparse.Namespace:
    incident_collect.add_argument("--since", default="-30 minutes", help="journalctl --since value")
    incident_collect.add_argument("--include-cores", action="store_true", help="Copy matching core files into the bundle")
    backtrace = subparsers.add_parser("backtrace", help="Generate a backtrace for the newest or selected core file")
    backtrace.add_argument("--core", help="Core file path. Defaults to the newest core in the runtime tree.")
    backtrace.add_argument("--exe", help="Executable path override.")
    auth_failures = subparsers.add_parser("auth-failures", help="Show recent auth failures from auth syslog")
    auth_failures.add_argument("--hours", type=int, default=24, help="How many hours back to inspect")
    auth_failures.add_argument("--limit", type=int, default=20, help="Maximum failures to show")
@@ -1194,6 +1199,19 @@ def run_incident_collect(tag: str, since: str, include_cores: bool) -> int:
    return 0
 def run_backtrace(core: str | None, exe: str | None) -> int:
    """Invoke the installed core backtrace helper.

    Args:
        core: Optional core file path, forwarded as ``--core``.
        exe: Optional executable override, forwarded as ``--exe``.

    Returns:
        ``0`` after the helper has been run.

    Raises:
        SystemExit: if the helper is not present at ``CORE_BACKTRACE_PATH``.
    """
    if not CORE_BACKTRACE_PATH.exists():
        raise SystemExit(f"Missing core backtrace helper: {CORE_BACKTRACE_PATH}")

    command = [str(CORE_BACKTRACE_PATH)]
    # Forward only the options the user actually supplied.
    for flag, value in (("--core", core), ("--exe", exe)):
        if value:
            command += [flag, value]

    run(command, require_root=True)
    return 0
 def main() -> int:
    args = parse_args()
@@ -1231,6 +1249,8 @@ def main() -> int:
        return run_logs(args.target, args.lines, args.follow)
    if args.command == "incident-collect":
        return run_incident_collect(args.tag, args.since, args.include_cores)
    if args.command == "backtrace":
        return run_backtrace(args.core, args.exe)
    if args.command == "healthcheck":
        return run_healthcheck(args.mode)
    if args.command == "wait-ready":
--- a/deploy/systemd/install_systemd.py
+++ b/deploy/systemd/install_systemd.py
@@ -149,6 +149,11 @@ def main() -> int:
        render_template(BIN_DIR / "metin-collect-incident.in", template_values),
        0o700,
    )
    write_text(
        sbin_dir / "metin-core-backtrace",
        render_template(BIN_DIR / "metin-core-backtrace.in", template_values),
        0o700,
    )
    copy_file(
        HEALTHCHECK_DIR / "metin-login-healthcheck.sh",
        sbin_dir / "metin-login-healthcheck",
--- a/docs/server-management.md
+++ b/docs/server-management.md
@@ -47,6 +47,7 @@ The Debian deployment installs:
 - restarting the whole stack or specific channels/instances
 - viewing logs
 - listing core files in the runtime tree
 - generating a backtrace for the newest or selected core file
 - collecting incident bundles
 - running the root-only headless healthcheck
 - waiting for login-ready state after restart
@@ -185,6 +186,18 @@ List core files currently present in the runtime tree:
 metinctl cores
 ```
 Generate a backtrace for the newest core file:
 ```bash
 metinctl backtrace
 ```
 Generate a backtrace for one specific core file:
 ```bash
 metinctl backtrace --core channels/channel1/core1/core.2255450
 ```
 Collect an incident bundle with logs, unit status, port state and repository revisions:
 ```bash
@@ -214,6 +227,7 @@ This makes channel enablement declarative instead of depending on whatever happe
 The Debian deployment now also installs:
 - `/usr/local/sbin/metin-collect-incident`
 - `/usr/local/sbin/metin-core-backtrace`
 The collector creates a timestamped bundle under:
@@ -231,3 +245,11 @@ Each bundle contains:
 If you call it with `--include-cores`, matching core files are copied into the bundle as well.
 The runtime units now also declare `LimitCORE=infinity`, so after the next service restart the processes are allowed to emit core dumps when the host kernel/core policy permits it.
 For quick manual crash triage outside the incident bundle flow, use:
 ```bash
 metinctl backtrace
 ```
 It defaults to the newest core file under the runtime tree's `channels/` directory, infers the executable path, and uses `gdb` (preferred) or `lldb` when present on the host. If no supported debugger is installed, it still prints `file` output for the core and the executable, plus `readelf -n` notes for the core when `readelf` is available.