From 0bc6559283bfd92e5ad73f814447637b4e62af5e Mon Sep 17 00:00:00 2001
From: server <server@jakubkadlec.dev>
Date: Tue, 14 Apr 2026 17:01:50 +0200
Subject: [PATCH] ops: add core backtrace helper

---
 deploy/systemd/README.md                   |   1 +
 deploy/systemd/bin/metin-core-backtrace.in | 197 +++++++++++++++++++++
 deploy/systemd/bin/metinctl.in             |  20 +++
 deploy/systemd/install_systemd.py          |   5 +
 docs/server-management.md                  |  22 +++
 5 files changed, 245 insertions(+)
 create mode 100644 deploy/systemd/bin/metin-core-backtrace.in

diff --git a/deploy/systemd/README.md b/deploy/systemd/README.md
index 7bce2e0..2a1c03a 100644
--- a/deploy/systemd/README.md
+++ b/deploy/systemd/README.md
@@ -35,6 +35,7 @@ The channel selection and port layout now come from the versioned inventory file
 - `/usr/local/libexec/metin-wait-port`
 - `/usr/local/bin/metinctl`
 - `/usr/local/sbin/metin-collect-incident`
+- `/usr/local/sbin/metin-core-backtrace`
 
 The `metin-db-ready.service` gate waits until the DB socket is actually accepting connections before `auth` and `game` units start.
 
diff --git a/deploy/systemd/bin/metin-core-backtrace.in b/deploy/systemd/bin/metin-core-backtrace.in
new file mode 100644
index 0000000..d200fe5
--- /dev/null
+++ b/deploy/systemd/bin/metin-core-backtrace.in
@@ -0,0 +1,197 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import argparse
+import os
+import re
+import shutil
+import subprocess
+from pathlib import Path
+
+RUNTIME_ROOT = Path("{{RUNTIME_ROOT}}")  # template placeholder; presumably substituted by the installer's render_template -- confirm
+
+
+def parse_args() -> argparse.Namespace:  # both flags are optional string overrides
+    cli = argparse.ArgumentParser(description="Generate a backtrace for a Metin runtime core file")
+    cli.add_argument("--core", help="Core file path. Defaults to the newest core under the runtime tree.")
+    cli.add_argument("--exe", help="Executable path override. If omitted, infer it from the core path.")
+    return cli.parse_args()
+
+
+def ensure_root() -> None:
+    if os.geteuid():  # a non-zero effective UID means the caller is not root
+        raise SystemExit("Run as root.")
+
+
+def run(command: list[str], check: bool = False) -> subprocess.CompletedProcess[str]:  # captures stdout/stderr as text
+    return subprocess.run(command, check=check, capture_output=True, text=True)
+
+
+def iter_core_files() -> list[Path]:
+    """Return regular files matching channels/**/core* under RUNTIME_ROOT, newest mtime first."""
+    found = [entry for entry in RUNTIME_ROOT.glob("channels/**/core*") if entry.is_file()]
+    # Most recently modified core first, so callers can simply take index 0.
+    found.sort(key=lambda entry: entry.stat().st_mtime, reverse=True)
+    return found
+
+
+def resolve_core_path(core_arg: str | None) -> Path:
+    """Return the requested core file, or the newest one found when *core_arg* is empty."""
+    if not core_arg:
+        newest_first = iter_core_files()
+        if not newest_first:
+            raise SystemExit(f"No core files found under {RUNTIME_ROOT}")
+        return newest_first[0]
+
+    requested = Path(core_arg)
+    # A relative argument is looked up under the runtime tree first, then taken as given.
+    if not requested.is_absolute() and (RUNTIME_ROOT / core_arg).exists():
+        requested = RUNTIME_ROOT / core_arg
+    requested = requested.resolve()
+    if not requested.is_file():
+        raise SystemExit(f"Core file not found: {requested}")
+    return requested
+
+
+def infer_execfn_from_file_output(core_path: Path) -> Path | None:
+    """Return the execfn path that `file` reports for the core, if it exists on disk.
+
+    Returns None when `file` fails, prints no execfn field, or names a missing file.
+    """
+    probe = run(["file", str(core_path)])
+    if probe.returncode != 0:
+        return None
+    found = re.search(r"execfn: '([^']+)'", probe.stdout)
+    if found is None:
+        return None
+    execfn = Path(found.group(1))
+    return execfn if execfn.is_file() else None
+
+
+def infer_executable(core_path: Path, exe_arg: str | None) -> Path:
+    """Return the binary for *core_path*: *exe_arg* if given, else a guess; exits when none exists."""
+    if exe_arg:
+        explicit = Path(exe_arg).resolve()
+        if explicit.is_file():
+            return explicit
+        raise SystemExit(f"Executable not found: {explicit}")
+
+    core_dir = core_path.parent
+    dir_name = core_dir.name
+    channel_name = core_dir.parent.name
+    fixed_names = {"db": "db", "auth": "game_auth"}
+
+    # Guesses from the directory layout come first; the execfn reported by `file` is the fallback.
+    guesses: list[Path] = []
+    if dir_name in fixed_names:
+        guesses.append(core_dir / fixed_names[dir_name])
+    elif dir_name.startswith("core") and channel_name.startswith("channel"):
+        guesses.append(core_dir / f"{channel_name}_{dir_name}")
+    execfn_guess = infer_execfn_from_file_output(core_path)
+    if execfn_guess:
+        guesses.append(execfn_guess)
+
+    for guess in guesses:
+        if guess.is_file():
+            return guess.resolve()
+    raise SystemExit(f"Could not infer executable for core file: {core_path}")
+
+
+def preferred_debugger() -> str | None:
+    """Return "gdb" if it is on PATH, otherwise "lldb" if present, otherwise None."""
+    # Tuple order encodes the preference: gdb wins when both are installed.
+    available = (tool for tool in ("gdb", "lldb") if shutil.which(tool))
+    return next(available, None)
+
+
+def format_section(title: str, body: str) -> str:
+    return "== {} ==\n{}\n".format(title, body.rstrip())  # header line, body without trailing whitespace, one newline
+
+
+def render_file_info(path: Path) -> str:
+    """Return a section holding what `file` prints for *path* (stderr if stdout is empty)."""
+    result = run(["file", str(path)])
+    return format_section(f"file {path}", result.stdout or result.stderr or "<no output>")
+
+
+def render_readelf_notes(core_path: Path) -> str:
+    """Return a `readelf -n` section for the core, or "" when readelf is not on PATH."""
+    if shutil.which("readelf") is None:
+        return ""
+    result = run(["readelf", "-n", str(core_path)])
+    return format_section(f"readelf -n {core_path}", result.stdout or result.stderr or "<no output>")
+
+
+def render_debugger_backtrace(debugger: str, exe_path: Path, core_path: Path) -> str:
+    """Run gdb or lldb in batch mode against the core and return a "backtrace" section.
+
+    The section echoes the command in shell-quoted form, then shows stdout and
+    stderr of the debugger and a marker line if it exited non-zero.
+    Raises SystemExit for any *debugger* other than "gdb" or "lldb".
+    """
+    import shlex  # local import: only needed to echo a copy-pasteable command line
+
+    if debugger == "gdb":
+        command = ["gdb", "-batch", "-ex", "set pagination off", "-ex", "thread apply all bt full"]
+        command += [str(exe_path), str(core_path)]
+    elif debugger == "lldb":
+        command = ["lldb", "--batch", "-o", "thread backtrace all", "-c", str(core_path), str(exe_path)]
+    else:
+        raise SystemExit(f"Unsupported debugger: {debugger}")
+
+    completed = run(command)
+    # Keep both streams: showing stderr only when stdout is empty would hide
+    # debugger diagnostics whenever a backtrace was printed.
+    streams = [text.rstrip() for text in (completed.stdout, completed.stderr) if text.strip()]
+    if completed.returncode != 0:
+        streams.append(f"<debugger exited with status {completed.returncode}>")
+    output = "\n\n".join(streams) or "<no output>"
+
+    # Quote each argument so "set pagination off" stays one word when pasted into a shell.
+    shown = " ".join(shlex.quote(arg) for arg in command)
+    return format_section("backtrace", f"$ {shown}\n\n{output}")
+
+
+def main() -> int:
+    """Print a crash-triage report for one core file and return exit status 0.
+
+    The report has a summary, `file` output for the core and the executable,
+    optional `readelf -n` notes, and a debugger backtrace (or an install hint
+    when neither gdb nor lldb is available).
+    """
+    args = parse_args()
+    ensure_root()
+
+    core_path = resolve_core_path(args.core)
+    exe_path = infer_executable(core_path, args.exe)
+    debugger = preferred_debugger()
+
+    summary_lines = [
+        f"core: {core_path}",
+        f"executable: {exe_path}",
+        f"debugger: {debugger or '<none>'}",
+    ]
+
+    sections = [
+        format_section("summary", "\n".join(summary_lines)),
+        render_file_info(core_path),
+        render_file_info(exe_path),
+        # Empty string when readelf is not installed; dropped by the filter below.
+        render_readelf_notes(core_path),
+    ]
+
+    if not debugger:
+        hint = "No supported debugger found. Install gdb or lldb on the host to generate a stack trace."
+        sections.append(format_section("backtrace", hint))
+    else:
+        sections.append(render_debugger_backtrace(debugger, exe_path, core_path))
+
+    # Skip empty sections, trim trailing whitespace from each one, and
+    # separate the remaining sections with a single newline.
+    report = "\n".join(section.rstrip() for section in sections if section)
+    print(report.rstrip())
+    return 0
+
+
+if __name__ == "__main__":  # run only when executed as a script
+    raise SystemExit(main())
diff --git a/deploy/systemd/bin/metinctl.in b/deploy/systemd/bin/metinctl.in
index 0628814..b9e6df8 100644
--- a/deploy/systemd/bin/metinctl.in
+++ b/deploy/systemd/bin/metinctl.in
@@ -18,6 +18,7 @@ REPO_ROOT = Path("{{REPO_ROOT}}")
 RUNTIME_ROOT = Path("{{RUNTIME_ROOT}}")
 HEALTHCHECK_PATH = Path("/usr/local/sbin/metin-login-healthcheck")
 INCIDENT_COLLECTOR_PATH = Path("/usr/local/sbin/metin-collect-incident")
+CORE_BACKTRACE_PATH = Path("/usr/local/sbin/metin-core-backtrace")
 INCIDENT_ROOT = Path("/var/lib/metin/incidents")
 AUTH_SYSLOG_PATH = RUNTIME_ROOT / "channels" / "auth" / "syslog.log"
 SOURCE_REPO_ROOT = RUNTIME_ROOT.parent.parent / "repos" / "m2dev-server-src"
@@ -116,6 +117,10 @@ def parse_args() -> argparse.Namespace:
     incident_collect.add_argument("--since", default="-30 minutes", help="journalctl --since value")
     incident_collect.add_argument("--include-cores", action="store_true", help="Copy matching core files into the bundle")
 
+    backtrace = subparsers.add_parser("backtrace", help="Generate a backtrace for the newest or selected core file")
+    backtrace.add_argument("--core", help="Core file path. Defaults to the newest core in the runtime tree.")
+    backtrace.add_argument("--exe", help="Executable path override.")
+
     auth_failures = subparsers.add_parser("auth-failures", help="Show recent auth failures from auth syslog")
     auth_failures.add_argument("--hours", type=int, default=24, help="How many hours back to inspect")
     auth_failures.add_argument("--limit", type=int, default=20, help="Maximum failures to show")
@@ -1194,6 +1199,19 @@ def run_incident_collect(tag: str, since: str, include_cores: bool) -> int:
     return 0
 
 
+def run_backtrace(core: str | None, exe: str | None) -> int:
+    """Invoke the installed core backtrace helper, forwarding --core/--exe when given."""
+    if not CORE_BACKTRACE_PATH.exists():
+        raise SystemExit(f"Missing core backtrace helper: {CORE_BACKTRACE_PATH}")
+
+    command = [str(CORE_BACKTRACE_PATH)]
+    for flag, value in (("--core", core), ("--exe", exe)):
+        if value:
+            command += [flag, value]
+    run(command, require_root=True)  # run() is not part of this patch; defined elsewhere in metinctl
+    return 0
+
+
 def main() -> int:
     args = parse_args()
 
@@ -1231,6 +1249,8 @@ def main() -> int:
         return run_logs(args.target, args.lines, args.follow)
     if args.command == "incident-collect":
         return run_incident_collect(args.tag, args.since, args.include_cores)
+    if args.command == "backtrace":
+        return run_backtrace(args.core, args.exe)
     if args.command == "healthcheck":
         return run_healthcheck(args.mode)
     if args.command == "wait-ready":
diff --git a/deploy/systemd/install_systemd.py b/deploy/systemd/install_systemd.py
index c18f0a4..1ef2205 100644
--- a/deploy/systemd/install_systemd.py
+++ b/deploy/systemd/install_systemd.py
@@ -149,6 +149,11 @@ def main() -> int:
         render_template(BIN_DIR / "metin-collect-incident.in", template_values),
         0o700,
     )
+    write_text(
+        sbin_dir / "metin-core-backtrace",
+        render_template(BIN_DIR / "metin-core-backtrace.in", template_values),
+        0o700,
+    )
     copy_file(
         HEALTHCHECK_DIR / "metin-login-healthcheck.sh",
         sbin_dir / "metin-login-healthcheck",
diff --git a/docs/server-management.md b/docs/server-management.md
index b23cb83..88132f7 100644
--- a/docs/server-management.md
+++ b/docs/server-management.md
@@ -47,6 +47,7 @@ The Debian deployment installs:
 - restarting the whole stack or specific channels/instances
 - viewing logs
 - listing core files in the runtime tree
+- generating a backtrace for the newest or selected core file
 - collecting incident bundles
 - running the root-only headless healthcheck
 - waiting for login-ready state after restart
@@ -185,6 +186,18 @@ List core files currently present in the runtime tree:
 metinctl cores
 ```
 
+Generate a backtrace for the newest core file:
+
+```bash
+metinctl backtrace
+```
+
+Generate a backtrace for one specific core file:
+
+```bash
+metinctl backtrace --core channels/channel1/core1/core.2255450
+```
+
 Collect an incident bundle with logs, unit status, port state and repository revisions:
 
 ```bash
@@ -214,6 +227,7 @@ This makes channel enablement declarative instead of depending on whatever happe
 The Debian deployment now also installs:
 
 - `/usr/local/sbin/metin-collect-incident`
+- `/usr/local/sbin/metin-core-backtrace`
 
 The collector creates a timestamped bundle under:
 
@@ -231,3 +245,11 @@ Each bundle contains:
 If you call it with `--include-cores`, matching core files are copied into the bundle as well.
 
 The runtime units now also declare `LimitCORE=infinity`, so after the next service restart the processes are allowed to emit core dumps when the host kernel/core policy permits it.
+
+For quick manual crash triage outside the incident bundle flow, use:
+
+```bash
+metinctl backtrace
+```
+
+It defaults to the newest core file under the runtime tree, infers the executable path, and uses `gdb` or `lldb` when present on the host. If no supported debugger is installed, it still prints `file` output for the core and executable, plus `readelf -n` notes for the core when `readelf` is available.