From 0bc6559283bfd92e5ad73f814447637b4e62af5e Mon Sep 17 00:00:00 2001
From: server
Date: Tue, 14 Apr 2026 17:01:50 +0200
Subject: [PATCH] ops: add core backtrace helper

---
 deploy/systemd/README.md                   |   1 +
 deploy/systemd/bin/metin-core-backtrace.in | 236 +++++++++++++++++++++
 deploy/systemd/bin/metinctl.in             |  20 +++
 deploy/systemd/install_systemd.py          |   5 +
 docs/server-management.md                  |  22 +++
 5 files changed, 284 insertions(+)
 create mode 100644 deploy/systemd/bin/metin-core-backtrace.in

diff --git a/deploy/systemd/README.md b/deploy/systemd/README.md
index 7bce2e0..2a1c03a 100644
--- a/deploy/systemd/README.md
+++ b/deploy/systemd/README.md
@@ -35,6 +35,7 @@ The channel selection and port layout now come from the versioned inventory file
 - `/usr/local/libexec/metin-wait-port`
 - `/usr/local/bin/metinctl`
 - `/usr/local/sbin/metin-collect-incident`
+- `/usr/local/sbin/metin-core-backtrace`
 
 The `metin-db-ready.service` gate waits until the DB socket is actually accepting connections before `auth` and `game` units start.
 
diff --git a/deploy/systemd/bin/metin-core-backtrace.in b/deploy/systemd/bin/metin-core-backtrace.in
new file mode 100644
index 0000000..d200fe5
--- /dev/null
+++ b/deploy/systemd/bin/metin-core-backtrace.in
@@ -0,0 +1,236 @@
+#!/usr/bin/env python3
+"""Print crash-triage details and a debugger backtrace for a Metin core file."""
+from __future__ import annotations
+
+import argparse
+import os
+import re
+import shlex
+import shutil
+import subprocess
+from pathlib import Path
+
+RUNTIME_ROOT = Path("{{RUNTIME_ROOT}}")
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse the ``--core`` and ``--exe`` command-line options."""
+    parser = argparse.ArgumentParser(description="Generate a backtrace for a Metin runtime core file")
+    parser.add_argument("--core", help="Core file path. Defaults to the newest core under the runtime tree.")
+    parser.add_argument("--exe", help="Executable path override. If omitted, infer it from the core path.")
+    return parser.parse_args()
+
+
+def ensure_root() -> None:
+    """Exit with an error message unless the effective user is root."""
+    if os.geteuid() != 0:
+        raise SystemExit("Run as root.")
+
+
+def run(command: list[str], check: bool = False) -> subprocess.CompletedProcess[str]:
+    """Run ``command`` without a shell and capture stdout/stderr as text."""
+    return subprocess.run(command, check=check, capture_output=True, text=True)
+
+
+def iter_core_files() -> list[Path]:
+    """Return regular files matching ``channels/**/core*``, newest first."""
+    found: list[tuple[float, Path]] = []
+    for path in RUNTIME_ROOT.glob("channels/**/core*"):
+        try:
+            if path.is_file():
+                found.append((path.stat().st_mtime, path))
+        except OSError:
+            # The file vanished or became unreadable between glob and stat.
+            continue
+    found.sort(key=lambda item: item[0], reverse=True)
+    return [path for _, path in found]
+
+
+def resolve_core_path(core_arg: str | None) -> Path:
+    """Return the core file to inspect.
+
+    A relative ``core_arg`` is tried under ``RUNTIME_ROOT`` first and falls
+    back to the current directory. Without ``core_arg`` the newest core file
+    under the runtime tree is used. Exits if no core file can be found.
+    """
+    if core_arg:
+        candidate = Path(core_arg)
+        if not candidate.is_absolute():
+            runtime_relative = RUNTIME_ROOT / core_arg
+            if runtime_relative.exists():
+                candidate = runtime_relative
+        candidate = candidate.resolve()
+        if not candidate.is_file():
+            raise SystemExit(f"Core file not found: {candidate}")
+        return candidate
+
+    cores = iter_core_files()
+    if not cores:
+        raise SystemExit(f"No core files found under {RUNTIME_ROOT}")
+    return cores[0]
+
+
+def infer_execfn_from_file_output(core_path: Path) -> Path | None:
+    """Return the ``execfn`` path reported by ``file`` if it exists on disk."""
+    completed = run(["file", str(core_path)])
+    if completed.returncode != 0:
+        return None
+
+    match = re.search(r"execfn: '([^']+)'", completed.stdout)
+    if not match:
+        return None
+
+    candidate = Path(match.group(1))
+    if candidate.is_file():
+        return candidate
+    return None
+
+
+def infer_executable(core_path: Path, exe_arg: str | None) -> Path:
+    """Return the executable that produced ``core_path``.
+
+    ``exe_arg`` wins when given. Otherwise the directory layout next to the
+    core file is tried first, then the ``execfn`` recorded in the core.
+    Exits if no candidate exists.
+    """
+    if exe_arg:
+        exe_path = Path(exe_arg).resolve()
+        if not exe_path.is_file():
+            raise SystemExit(f"Executable not found: {exe_path}")
+        return exe_path
+
+    parent_name = core_path.parent.name
+    grandparent_name = core_path.parent.parent.name
+
+    candidates: list[Path] = []
+    if parent_name == "db":
+        candidates.append(core_path.parent / "db")
+    elif parent_name == "auth":
+        candidates.append(core_path.parent / "game_auth")
+    elif parent_name.startswith("core") and grandparent_name.startswith("channel"):
+        candidates.append(core_path.parent / f"{grandparent_name}_{parent_name}")
+
+    execfn_candidate = infer_execfn_from_file_output(core_path)
+    if execfn_candidate:
+        candidates.append(execfn_candidate)
+
+    for candidate in candidates:
+        if candidate.is_file():
+            return candidate.resolve()
+
+    raise SystemExit(f"Could not infer executable for core file: {core_path}")
+
+
+def preferred_debugger() -> str | None:
+    """Return ``gdb`` if on PATH, else ``lldb`` if on PATH, else None."""
+    for tool in ("gdb", "lldb"):
+        if shutil.which(tool):
+            return tool
+    return None
+
+
+def format_section(title: str, body: str) -> str:
+    """Format ``body`` under a ``== title ==`` heading with one trailing newline."""
+    return f"== {title} ==\n{body.rstrip()}\n"
+
+
+def render_file_info(path: Path) -> str:
+    """Return a section with the ``file`` output for ``path``."""
+    completed = run(["file", str(path)])
+    body = completed.stdout or completed.stderr or ""
+    return format_section(f"file {path}", body)
+
+
+def render_readelf_notes(core_path: Path) -> str:
+    """Return a ``readelf -n`` section, or an empty string without readelf."""
+    if not shutil.which("readelf"):
+        return ""
+    completed = run(["readelf", "-n", str(core_path)])
+    body = completed.stdout or completed.stderr or ""
+    return format_section(f"readelf -n {core_path}", body)
+
+
+def render_debugger_backtrace(debugger: str, exe_path: Path, core_path: Path) -> str:
+    """Run ``debugger`` in batch mode and return its output as a section.
+
+    The section starts with the shell-quoted command line so it can be
+    re-run by hand, followed by stdout, stderr and any non-zero exit status.
+    """
+    if debugger == "gdb":
+        command = [
+            "gdb",
+            "-batch",
+            "-ex",
+            "set pagination off",
+            "-ex",
+            "thread apply all bt full",
+            str(exe_path),
+            str(core_path),
+        ]
+    elif debugger == "lldb":
+        command = [
+            "lldb",
+            "--batch",
+            "-o",
+            "thread backtrace all",
+            "-c",
+            str(core_path),
+            str(exe_path),
+        ]
+    else:
+        raise SystemExit(f"Unsupported debugger: {debugger}")
+
+    completed = run(command)
+    # Keep both streams: warnings and fatal errors arrive on stderr and would
+    # otherwise be dropped whenever the debugger also wrote to stdout.
+    parts = [text.rstrip() for text in (completed.stdout, completed.stderr) if text.strip()]
+    if completed.returncode != 0:
+        parts.append(f"[{debugger} exited with status {completed.returncode}]")
+    output = "\n\n".join(parts)
+    return format_section("backtrace", f"$ {shlex.join(command)}\n\n{output}")
+
+
+def main() -> int:
+    """Print the summary, file metadata and backtrace sections to stdout."""
+    args = parse_args()
+    ensure_root()
+
+    core_path = resolve_core_path(args.core)
+    exe_path = infer_executable(core_path, args.exe)
+    debugger = preferred_debugger()
+
+    sections = [
+        format_section(
+            "summary",
+            "\n".join(
+                [
+                    f"core: {core_path}",
+                    f"executable: {exe_path}",
+                    f"debugger: {debugger or ''}",
+                ]
+            ),
+        ),
+        render_file_info(core_path),
+        render_file_info(exe_path),
+    ]
+
+    readelf_section = render_readelf_notes(core_path)
+    if readelf_section:
+        sections.append(readelf_section)
+
+    if debugger:
+        sections.append(render_debugger_backtrace(debugger, exe_path, core_path))
+    else:
+        sections.append(
+            format_section(
+                "backtrace",
+                "No supported debugger found. Install gdb or lldb on the host to generate a stack trace.",
+            )
+        )
+
+    print("\n".join(section.rstrip() for section in sections if section).rstrip())
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/deploy/systemd/bin/metinctl.in b/deploy/systemd/bin/metinctl.in
index 0628814..b9e6df8 100644
--- a/deploy/systemd/bin/metinctl.in
+++ b/deploy/systemd/bin/metinctl.in
@@ -18,6 +18,7 @@ REPO_ROOT = Path("{{REPO_ROOT}}")
 RUNTIME_ROOT = Path("{{RUNTIME_ROOT}}")
 HEALTHCHECK_PATH = Path("/usr/local/sbin/metin-login-healthcheck")
 INCIDENT_COLLECTOR_PATH = Path("/usr/local/sbin/metin-collect-incident")
+CORE_BACKTRACE_PATH = Path("/usr/local/sbin/metin-core-backtrace")
 INCIDENT_ROOT = Path("/var/lib/metin/incidents")
 AUTH_SYSLOG_PATH = RUNTIME_ROOT / "channels" / "auth" / "syslog.log"
 SOURCE_REPO_ROOT = RUNTIME_ROOT.parent.parent / "repos" / "m2dev-server-src"
@@ -116,6 +117,10 @@ def parse_args() -> argparse.Namespace:
     incident_collect.add_argument("--since", default="-30 minutes", help="journalctl --since value")
     incident_collect.add_argument("--include-cores", action="store_true", help="Copy matching core files into the bundle")
 
+    backtrace = subparsers.add_parser("backtrace", help="Generate a backtrace for the newest or selected core file")
+    backtrace.add_argument("--core", help="Core file path. Defaults to the newest core in the runtime tree.")
+    backtrace.add_argument("--exe", help="Executable path override.")
+
     auth_failures = subparsers.add_parser("auth-failures", help="Show recent auth failures from auth syslog")
     auth_failures.add_argument("--hours", type=int, default=24, help="How many hours back to inspect")
     auth_failures.add_argument("--limit", type=int, default=20, help="Maximum failures to show")
@@ -1194,6 +1199,19 @@ def run_incident_collect(tag: str, since: str, include_cores: bool) -> int:
     return 0
 
 
+def run_backtrace(core: str | None, exe: str | None) -> int:
+    if not CORE_BACKTRACE_PATH.exists():
+        raise SystemExit(f"Missing core backtrace helper: {CORE_BACKTRACE_PATH}")
+
+    command = [str(CORE_BACKTRACE_PATH)]
+    if core:
+        command.extend(["--core", core])
+    if exe:
+        command.extend(["--exe", exe])
+    run(command, require_root=True)
+    return 0
+
+
 def main() -> int:
     args = parse_args()
 
@@ -1231,6 +1249,8 @@ def main() -> int:
         return run_logs(args.target, args.lines, args.follow)
     if args.command == "incident-collect":
         return run_incident_collect(args.tag, args.since, args.include_cores)
+    if args.command == "backtrace":
+        return run_backtrace(args.core, args.exe)
     if args.command == "healthcheck":
         return run_healthcheck(args.mode)
     if args.command == "wait-ready":
diff --git a/deploy/systemd/install_systemd.py b/deploy/systemd/install_systemd.py
index c18f0a4..1ef2205 100644
--- a/deploy/systemd/install_systemd.py
+++ b/deploy/systemd/install_systemd.py
@@ -149,6 +149,11 @@ def main() -> int:
         render_template(BIN_DIR / "metin-collect-incident.in", template_values),
         0o700,
     )
+    write_text(
+        sbin_dir / "metin-core-backtrace",
+        render_template(BIN_DIR / "metin-core-backtrace.in", template_values),
+        0o700,
+    )
     copy_file(
         HEALTHCHECK_DIR / "metin-login-healthcheck.sh",
         sbin_dir / "metin-login-healthcheck",
diff --git a/docs/server-management.md b/docs/server-management.md
index b23cb83..88132f7 100644
--- a/docs/server-management.md
+++ b/docs/server-management.md
@@ -47,6 +47,7 @@ The Debian deployment installs:
 - restarting the whole stack or specific channels/instances
 - viewing logs
 - listing core files in the runtime tree
+- generating a backtrace for the newest or selected core file
 - collecting incident bundles
 - running the root-only headless healthcheck
 - waiting for login-ready state after restart
@@ -185,6 +186,18 @@ List core files currently present in the runtime tree:
 metinctl cores
 ```
 
+Generate a backtrace for the newest core file:
+
+```bash
+metinctl backtrace
+```
+
+Generate a backtrace for one specific core file:
+
+```bash
+metinctl backtrace --core channels/channel1/core1/core.2255450
+```
+
 Collect an incident bundle with logs, unit status, port state and repository revisions:
 
 ```bash
@@ -214,6 +227,7 @@ This makes channel enablement declarative instead of depending on whatever happe
 The Debian deployment now also installs:
 
 - `/usr/local/sbin/metin-collect-incident`
+- `/usr/local/sbin/metin-core-backtrace`
 
 The collector creates a timestamped bundle under:
 
@@ -231,3 +245,11 @@ Each bundle contains:
 If you call it with `--include-cores`, matching core files are copied into the bundle as well.
 
 The runtime units now also declare `LimitCORE=infinity`, so after the next service restart the processes are allowed to emit core dumps when the host kernel/core policy permits it.
+
+For quick manual crash triage outside the incident bundle flow, use:
+
+```bash
+metinctl backtrace
+```
+
+It defaults to the newest core file under the runtime tree, infers the executable path, and uses `gdb` or `lldb` when present on the host. If no supported debugger is installed, it still prints file/readelf metadata for the core and executable.