ops: add core backtrace helper

This commit is contained in:
server
2026-04-14 17:01:50 +02:00
parent c5bb515781
commit 0bc6559283
5 changed files with 245 additions and 0 deletions

View File

@@ -35,6 +35,7 @@ The channel selection and port layout now come from the versioned inventory file
- `/usr/local/libexec/metin-wait-port` - `/usr/local/libexec/metin-wait-port`
- `/usr/local/bin/metinctl` - `/usr/local/bin/metinctl`
- `/usr/local/sbin/metin-collect-incident` - `/usr/local/sbin/metin-collect-incident`
- `/usr/local/sbin/metin-core-backtrace`
The `metin-db-ready.service` gate waits until the DB socket is actually accepting connections before `auth` and `game` units start. The `metin-db-ready.service` gate waits until the DB socket is actually accepting connections before `auth` and `game` units start.

View File

@@ -0,0 +1,197 @@
#!/usr/bin/env python3
from __future__ import annotations
import argparse
import os
import re
import shutil
import subprocess
from pathlib import Path
# Root of the runtime tree: core files are searched below it and relative
# --core arguments are tried against it first.
# NOTE(review): "{{RUNTIME_ROOT}}" looks like a template placeholder that is
# substituted when the script is installed - confirm against the installer.
RUNTIME_ROOT = Path("{{RUNTIME_ROOT}}")
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the backtrace helper.

    Returns:
        The parsed namespace with the optional ``core`` and ``exe``
        attributes; each is ``None`` when its option is not given.
    """
    cli = argparse.ArgumentParser(description="Generate a backtrace for a Metin runtime core file")
    option_help = {
        "--core": "Core file path. Defaults to the newest core under the runtime tree.",
        "--exe": "Executable path override. If omitted, infer it from the core path.",
    }
    for flag, text in option_help.items():
        cli.add_argument(flag, help=text)
    return cli.parse_args()
def ensure_root() -> None:
    """Abort unless the process runs with an effective UID of 0.

    Raises:
        SystemExit: With the message "Run as root." for any other user.
    """
    running_as_root = os.geteuid() == 0
    if running_as_root:
        return
    raise SystemExit("Run as root.")
def run(command: list[str], check: bool = False) -> subprocess.CompletedProcess[str]:
    """Run *command* without a shell and capture stdout and stderr as text.

    Args:
        command: Program and arguments, passed to ``subprocess.run`` as a list.
        check: When true, a non-zero exit status raises instead of returning.

    Returns:
        The completed process with ``stdout`` and ``stderr`` decoded to ``str``.

    Raises:
        subprocess.CalledProcessError: If *check* is true and the command
            exits with a non-zero status.
        OSError: If the program cannot be started (for example, it is not
            installed).
    """
    # The captured output comes from tools inspecting core dumps and may
    # contain bytes that are invalid in the locale encoding. Strict decoding
    # would raise UnicodeDecodeError and abort the whole report, so replace
    # undecodable bytes instead.
    return subprocess.run(
        command,
        check=check,
        capture_output=True,
        text=True,
        errors="replace",
    )
def iter_core_files() -> list[Path]:
    """List the core files below the runtime tree, newest first.

    Returns:
        Every regular file matching ``channels/**/core*`` under
        ``RUNTIME_ROOT``, sorted by modification time in descending order.
    """

    def modified_at(core: Path) -> float:
        return core.stat().st_mtime

    found = [entry for entry in RUNTIME_ROOT.glob("channels/**/core*") if entry.is_file()]
    found.sort(key=modified_at, reverse=True)
    return found
def resolve_core_path(core_arg: str | None) -> Path:
    """Return the core file to analyse.

    Args:
        core_arg: Path given on the command line, or ``None``/empty to pick
            the newest core file from the runtime tree. A relative path is
            tried below ``RUNTIME_ROOT`` first and otherwise resolved against
            the current working directory.

    Returns:
        The resolved path for an explicit argument, or the newest core file
        as returned by ``iter_core_files``.

    Raises:
        SystemExit: If the requested path is not a file, or no core file
            exists in the runtime tree.
    """
    if not core_arg:
        newest_first = iter_core_files()
        if newest_first:
            return newest_first[0]
        raise SystemExit(f"No core files found under {RUNTIME_ROOT}")

    chosen = Path(core_arg)
    if not chosen.is_absolute():
        under_runtime = RUNTIME_ROOT / core_arg
        chosen = under_runtime if under_runtime.exists() else chosen
    chosen = chosen.resolve()
    if chosen.is_file():
        return chosen
    raise SystemExit(f"Core file not found: {chosen}")
def infer_execfn_from_file_output(core_path: Path) -> Path | None:
    """Read the crashed program's path from the ``execfn`` field of ``file`` output.

    Args:
        core_path: Core file to inspect.

    Returns:
        The path quoted in ``execfn: '...'`` when it names an existing file.
        ``None`` when the ``file`` tool is not installed, exits with a
        non-zero status, prints no ``execfn`` field, or the named path is not
        a file.
    """
    # This lookup is best effort. Without the guard, a host that lacks the
    # `file` tool makes subprocess raise FileNotFoundError and the whole
    # helper dies with a traceback.
    if not shutil.which("file"):
        return None

    completed = run(["file", str(core_path)])
    if completed.returncode != 0:
        return None

    match = re.search(r"execfn: '([^']+)'", completed.stdout)
    if match is None:
        return None

    candidate = Path(match.group(1))
    return candidate if candidate.is_file() else None
def infer_executable(core_path: Path, exe_arg: str | None) -> Path:
    """Determine which executable produced *core_path*.

    Args:
        core_path: Core file being analysed.
        exe_arg: Explicit executable path; when given it is used as is and no
            inference takes place.

    Returns:
        The resolved path of the executable.

    Raises:
        SystemExit: If *exe_arg* does not name a file, or no candidate
            executable exists.
    """
    if exe_arg:
        exe_path = Path(exe_arg).resolve()
        if not exe_path.is_file():
            raise SystemExit(f"Executable not found: {exe_path}")
        return exe_path

    core_dir = core_path.parent
    parent_name = core_dir.name
    # A Path is always truthy and the name of a root directory is "", so no
    # emptiness guard is needed around the grandparent lookup.
    grandparent_name = core_dir.parent.name

    # First choice: derive the binary name from the directory the core is in.
    layout_candidates: list[Path] = []
    if parent_name == "db":
        layout_candidates.append(core_dir / "db")
    elif parent_name == "auth":
        layout_candidates.append(core_dir / "game_auth")
    elif parent_name.startswith("core") and grandparent_name.startswith("channel"):
        layout_candidates.append(core_dir / f"{grandparent_name}_{parent_name}")

    for candidate in layout_candidates:
        if candidate.is_file():
            return candidate.resolve()

    # Second choice: ask `file` for the execfn recorded in the core. It is
    # consulted only when the layout gave no match; the selected executable is
    # the same as before, but the subprocess is no longer spawned needlessly.
    execfn_candidate = infer_execfn_from_file_output(core_path)
    if execfn_candidate and execfn_candidate.is_file():
        return execfn_candidate.resolve()

    raise SystemExit(f"Could not infer executable for core file: {core_path}")
def preferred_debugger() -> str | None:
    """Pick the debugger to use, preferring ``gdb`` over ``lldb``.

    Returns:
        The name of the first of ``gdb`` and ``lldb`` found on ``PATH``, or
        ``None`` when neither is installed.
    """
    available = (name for name in ("gdb", "lldb") if shutil.which(name))
    return next(available, None)
def format_section(title: str, body: str) -> str:
    """Format one report section as a ``== title ==`` header followed by *body*.

    Trailing whitespace of *body* is removed and the result ends with exactly
    one newline.
    """
    header = "== " + title + " =="
    trimmed = body.rstrip()
    return "\n".join((header, trimmed)) + "\n"
def render_file_info(path: Path) -> str:
    """Render the output of ``file <path>`` as a report section.

    Args:
        path: File to describe.

    Returns:
        A section titled ``file <path>`` holding the tool's stdout, or its
        stderr when stdout is empty, or ``<no output>`` when both are empty.
        An empty string is returned when the ``file`` tool is not installed,
        so that the caller can leave the section out.
    """
    # Without this guard a host that lacks the `file` tool makes subprocess
    # raise FileNotFoundError and the report is never printed. An empty
    # section mirrors how a missing `readelf` is handled.
    if not shutil.which("file"):
        return ""

    completed = run(["file", str(path)])
    body = completed.stdout or completed.stderr or "<no output>"
    return format_section(f"file {path}", body)
def render_readelf_notes(core_path: Path) -> str:
    """Render the output of ``readelf -n`` for the core file as a report section.

    Returns:
        An empty string when ``readelf`` is not on ``PATH``. Otherwise a
        section titled ``readelf -n <core_path>`` holding stdout, or stderr
        when stdout is empty, or ``<no output>`` when both are empty.
    """
    if shutil.which("readelf") is None:
        return ""

    result = run(["readelf", "-n", str(core_path)])
    text = next(
        (stream for stream in (result.stdout, result.stderr) if stream),
        "<no output>",
    )
    return format_section(f"readelf -n {core_path}", text)
def render_debugger_backtrace(debugger: str, exe_path: Path, core_path: Path) -> str:
    """Run *debugger* in batch mode on the core file and render its backtrace.

    Args:
        debugger: Either ``"gdb"`` or ``"lldb"``.
        exe_path: Executable that produced the core file.
        core_path: Core file to load.

    Returns:
        A ``backtrace`` section that shows the command line followed by the
        debugger output. When the debugger wrote to both stdout and stderr,
        stderr is appended under a ``[stderr]`` marker.

    Raises:
        SystemExit: If *debugger* is neither ``"gdb"`` nor ``"lldb"``.
    """
    import shlex  # local import: only needed to echo the command line

    if debugger == "gdb":
        command = [
            "gdb",
            "-batch",
            "-ex",
            "set pagination off",
            "-ex",
            "thread apply all bt full",
            str(exe_path),
            str(core_path),
        ]
    elif debugger == "lldb":
        command = [
            "lldb",
            "--batch",
            "-o",
            "thread backtrace all",
            "-c",
            str(core_path),
            str(exe_path),
        ]
    else:
        raise SystemExit(f"Unsupported debugger: {debugger}")

    completed = run(command)

    # Picking only the first non-empty stream dropped everything the debugger
    # wrote to stderr whenever stdout had content. Keep both so that
    # diagnostics printed there stay visible next to the backtrace.
    stdout_text = (completed.stdout or "").rstrip()
    stderr_text = (completed.stderr or "").rstrip()
    if stdout_text and stderr_text:
        output = f"{stdout_text}\n\n[stderr]\n{stderr_text}"
    else:
        output = stdout_text or stderr_text or "<no output>"

    # Quote each argument so that multi-word ones such as "set pagination off"
    # are shown as a single argument and the line can be pasted into a shell.
    shown_command = " ".join(shlex.quote(part) for part in command)
    return format_section("backtrace", f"$ {shown_command}\n\n{output}")
def main() -> int:
    """Print a crash report for the selected core file and return 0.

    The report consists of a summary, the ``file`` output for the core and
    the executable, the ``readelf -n`` notes when available, and either the
    debugger backtrace or a hint that no debugger is installed.
    """
    args = parse_args()
    ensure_root()

    core_path = resolve_core_path(args.core)
    exe_path = infer_executable(core_path, args.exe)
    debugger = preferred_debugger()

    summary_lines = (
        f"core: {core_path}",
        f"executable: {exe_path}",
        f"debugger: {debugger or '<none>'}",
    )
    report = [format_section("summary", "\n".join(summary_lines))]
    report.extend(render_file_info(target) for target in (core_path, exe_path))
    # Empty when readelf is unavailable; empty sections are dropped below.
    report.append(render_readelf_notes(core_path))

    if debugger:
        report.append(render_debugger_backtrace(debugger, exe_path, core_path))
    else:
        report.append(
            format_section(
                "backtrace",
                "No supported debugger found. Install gdb or lldb on the host to generate a stack trace.",
            )
        )

    printable = [section.rstrip() for section in report if section]
    print("\n".join(printable).rstrip())
    return 0


if __name__ == "__main__":
    raise SystemExit(main())

View File

@@ -18,6 +18,7 @@ REPO_ROOT = Path("{{REPO_ROOT}}")
RUNTIME_ROOT = Path("{{RUNTIME_ROOT}}") RUNTIME_ROOT = Path("{{RUNTIME_ROOT}}")
HEALTHCHECK_PATH = Path("/usr/local/sbin/metin-login-healthcheck") HEALTHCHECK_PATH = Path("/usr/local/sbin/metin-login-healthcheck")
INCIDENT_COLLECTOR_PATH = Path("/usr/local/sbin/metin-collect-incident") INCIDENT_COLLECTOR_PATH = Path("/usr/local/sbin/metin-collect-incident")
CORE_BACKTRACE_PATH = Path("/usr/local/sbin/metin-core-backtrace")
INCIDENT_ROOT = Path("/var/lib/metin/incidents") INCIDENT_ROOT = Path("/var/lib/metin/incidents")
AUTH_SYSLOG_PATH = RUNTIME_ROOT / "channels" / "auth" / "syslog.log" AUTH_SYSLOG_PATH = RUNTIME_ROOT / "channels" / "auth" / "syslog.log"
SOURCE_REPO_ROOT = RUNTIME_ROOT.parent.parent / "repos" / "m2dev-server-src" SOURCE_REPO_ROOT = RUNTIME_ROOT.parent.parent / "repos" / "m2dev-server-src"
@@ -116,6 +117,10 @@ def parse_args() -> argparse.Namespace:
incident_collect.add_argument("--since", default="-30 minutes", help="journalctl --since value") incident_collect.add_argument("--since", default="-30 minutes", help="journalctl --since value")
incident_collect.add_argument("--include-cores", action="store_true", help="Copy matching core files into the bundle") incident_collect.add_argument("--include-cores", action="store_true", help="Copy matching core files into the bundle")
backtrace = subparsers.add_parser("backtrace", help="Generate a backtrace for the newest or selected core file")
backtrace.add_argument("--core", help="Core file path. Defaults to the newest core in the runtime tree.")
backtrace.add_argument("--exe", help="Executable path override.")
auth_failures = subparsers.add_parser("auth-failures", help="Show recent auth failures from auth syslog") auth_failures = subparsers.add_parser("auth-failures", help="Show recent auth failures from auth syslog")
auth_failures.add_argument("--hours", type=int, default=24, help="How many hours back to inspect") auth_failures.add_argument("--hours", type=int, default=24, help="How many hours back to inspect")
auth_failures.add_argument("--limit", type=int, default=20, help="Maximum failures to show") auth_failures.add_argument("--limit", type=int, default=20, help="Maximum failures to show")
@@ -1194,6 +1199,19 @@ def run_incident_collect(tag: str, since: str, include_cores: bool) -> int:
return 0 return 0
def run_backtrace(core: str | None, exe: str | None) -> int:
    """Invoke the installed core backtrace helper and return 0.

    Args:
        core: Optional core file path, forwarded as ``--core``.
        exe: Optional executable override, forwarded as ``--exe``.

    Raises:
        SystemExit: If the helper at ``CORE_BACKTRACE_PATH`` does not exist.
    """
    helper = CORE_BACKTRACE_PATH
    if not helper.exists():
        raise SystemExit(f"Missing core backtrace helper: {CORE_BACKTRACE_PATH}")

    argv = [str(helper)]
    for flag, value in (("--core", core), ("--exe", exe)):
        if value:
            argv += [flag, value]

    run(argv, require_root=True)
    return 0
def main() -> int: def main() -> int:
args = parse_args() args = parse_args()
@@ -1231,6 +1249,8 @@ def main() -> int:
return run_logs(args.target, args.lines, args.follow) return run_logs(args.target, args.lines, args.follow)
if args.command == "incident-collect": if args.command == "incident-collect":
return run_incident_collect(args.tag, args.since, args.include_cores) return run_incident_collect(args.tag, args.since, args.include_cores)
if args.command == "backtrace":
return run_backtrace(args.core, args.exe)
if args.command == "healthcheck": if args.command == "healthcheck":
return run_healthcheck(args.mode) return run_healthcheck(args.mode)
if args.command == "wait-ready": if args.command == "wait-ready":

View File

@@ -149,6 +149,11 @@ def main() -> int:
render_template(BIN_DIR / "metin-collect-incident.in", template_values), render_template(BIN_DIR / "metin-collect-incident.in", template_values),
0o700, 0o700,
) )
write_text(
sbin_dir / "metin-core-backtrace",
render_template(BIN_DIR / "metin-core-backtrace.in", template_values),
0o700,
)
copy_file( copy_file(
HEALTHCHECK_DIR / "metin-login-healthcheck.sh", HEALTHCHECK_DIR / "metin-login-healthcheck.sh",
sbin_dir / "metin-login-healthcheck", sbin_dir / "metin-login-healthcheck",

View File

@@ -47,6 +47,7 @@ The Debian deployment installs:
- restarting the whole stack or specific channels/instances - restarting the whole stack or specific channels/instances
- viewing logs - viewing logs
- listing core files in the runtime tree - listing core files in the runtime tree
- generating a backtrace for the newest or selected core file
- collecting incident bundles - collecting incident bundles
- running the root-only headless healthcheck - running the root-only headless healthcheck
- waiting for login-ready state after restart - waiting for login-ready state after restart
@@ -185,6 +186,18 @@ List core files currently present in the runtime tree:
metinctl cores metinctl cores
``` ```
Generate a backtrace for the newest core file:
```bash
metinctl backtrace
```
Generate a backtrace for one specific core file:
```bash
metinctl backtrace --core channels/channel1/core1/core.2255450
```
Collect an incident bundle with logs, unit status, port state and repository revisions: Collect an incident bundle with logs, unit status, port state and repository revisions:
```bash ```bash
@@ -214,6 +227,7 @@ This makes channel enablement declarative instead of depending on whatever happe
The Debian deployment now also installs: The Debian deployment now also installs:
- `/usr/local/sbin/metin-collect-incident` - `/usr/local/sbin/metin-collect-incident`
- `/usr/local/sbin/metin-core-backtrace`
The collector creates a timestamped bundle under: The collector creates a timestamped bundle under:
@@ -231,3 +245,11 @@ Each bundle contains:
If you call it with `--include-cores`, matching core files are copied into the bundle as well. If you call it with `--include-cores`, matching core files are copied into the bundle as well.
The runtime units now also declare `LimitCORE=infinity`, so after the next service restart the processes are allowed to emit core dumps when the host kernel/core policy permits it. The runtime units now also declare `LimitCORE=infinity`, so after the next service restart the processes are allowed to emit core dumps when the host kernel/core policy permits it.
For quick manual crash triage outside the incident bundle flow, use:
```bash
metinctl backtrace
```
It defaults to the newest core file under the runtime tree, infers the executable path, and uses `gdb` (preferred) or `lldb` when present on the host. If no supported debugger is installed, it still prints the `file` output for the core and the executable and, when `readelf` is available, the `readelf -n` notes of the core.