ops: add core backtrace helper
This commit is contained in:
@@ -35,6 +35,7 @@ The channel selection and port layout now come from the versioned inventory file
|
|||||||
- `/usr/local/libexec/metin-wait-port`
|
- `/usr/local/libexec/metin-wait-port`
|
||||||
- `/usr/local/bin/metinctl`
|
- `/usr/local/bin/metinctl`
|
||||||
- `/usr/local/sbin/metin-collect-incident`
|
- `/usr/local/sbin/metin-collect-incident`
|
||||||
|
- `/usr/local/sbin/metin-core-backtrace`
|
||||||
|
|
||||||
The `metin-db-ready.service` gate waits until the DB socket is actually accepting connections before `auth` and `game` units start.
|
The `metin-db-ready.service` gate waits until the DB socket is actually accepting connections before `auth` and `game` units start.
|
||||||
|
|
||||||
|
|||||||
197
deploy/systemd/bin/metin-core-backtrace.in
Normal file
197
deploy/systemd/bin/metin-core-backtrace.in
Normal file
@@ -0,0 +1,197 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import shutil
|
||||||
|
import subprocess
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Root of the deployed runtime tree; core files are searched for beneath it.
# This file is a template: "{{RUNTIME_ROOT}}" is a placeholder that is
# presumably substituted by the installer when it renders the script --
# confirm against the install step.
RUNTIME_ROOT = Path("{{RUNTIME_ROOT}}")
|
||||||
|
|
||||||
|
|
||||||
|
def parse_args() -> argparse.Namespace:
    """Parse the helper's command-line options.

    Returns:
        A namespace with ``core`` and ``exe`` attributes. Each one is
        ``None`` when the matching option was not given.
    """
    # (flag, help text) pairs; both options take a single string value.
    options = (
        ("--core", "Core file path. Defaults to the newest core under the runtime tree."),
        ("--exe", "Executable path override. If omitted, infer it from the core path."),
    )
    cli = argparse.ArgumentParser(description="Generate a backtrace for a Metin runtime core file")
    for flag, help_text in options:
        cli.add_argument(flag, help=help_text)
    return cli.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def ensure_root() -> None:
    """Abort unless the process runs with root privileges.

    Raises:
        SystemExit: With the message ``Run as root.`` when the effective
            user id is not 0.
    """
    running_as_root = os.geteuid() == 0
    if running_as_root:
        return
    raise SystemExit("Run as root.")
|
||||||
|
|
||||||
|
|
||||||
|
def run(command: list[str], check: bool = False) -> subprocess.CompletedProcess[str]:
    """Run *command* without a shell and capture its output as text.

    Args:
        command: Program and arguments, passed to the OS as-is.
        check: When true, a non-zero exit status raises
            ``subprocess.CalledProcessError``.

    Returns:
        The completed process, with ``stdout`` and ``stderr`` decoded to
        ``str``.
    """
    return subprocess.run(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
        check=check,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def iter_core_files() -> list[Path]:
    """List core files under the runtime tree, newest first.

    Regular files whose name starts with ``core`` are collected from any
    depth below ``RUNTIME_ROOT / "channels"``.

    Returns:
        Matching paths ordered by modification time, most recent first.
        The list is empty when nothing matches.
    """
    dated: list[tuple[float, Path]] = []
    for path in RUNTIME_ROOT.glob("channels/**/core*"):
        try:
            if not path.is_file():
                continue
            modified = path.stat().st_mtime
        except OSError:
            # The entry vanished or became unreadable while the tree was
            # being scanned (for example a core being rotated away).
            # Skip it so one stale entry does not abort the whole listing.
            continue
        dated.append((modified, path))

    # Sort on the timestamp only so ties keep glob order.
    dated.sort(key=lambda entry: entry[0], reverse=True)
    return [path for _, path in dated]
|
||||||
|
|
||||||
|
|
||||||
|
def resolve_core_path(core_arg: str | None) -> Path:
    """Pick the core file to analyse.

    Args:
        core_arg: Path given on the command line, or ``None``/empty to use
            the newest core under the runtime tree. A relative path is
            tried beneath ``RUNTIME_ROOT`` first; if nothing exists there
            it is resolved against the current working directory.

    Returns:
        The resolved path of an explicitly named core, or the newest
        discovered core file.

    Raises:
        SystemExit: When the named core is not a regular file, or when no
            core file exists under the runtime tree.
    """
    if not core_arg:
        newest_first = iter_core_files()
        if newest_first:
            return newest_first[0]
        raise SystemExit(f"No core files found under {RUNTIME_ROOT}")

    chosen = Path(core_arg)
    if not chosen.is_absolute():
        under_runtime = RUNTIME_ROOT / core_arg
        chosen = under_runtime if under_runtime.exists() else chosen

    chosen = chosen.resolve()
    if chosen.is_file():
        return chosen
    raise SystemExit(f"Core file not found: {chosen}")
|
||||||
|
|
||||||
|
|
||||||
|
def infer_execfn_from_file_output(core_path: Path) -> Path | None:
    """Recover the crashed program's path from ``file`` output for a core.

    ``file`` is run on *core_path* and its output is searched for an
    ``execfn: '<path>'`` entry.

    Args:
        core_path: Core file to inspect.

    Returns:
        The path of the executable named by the core if it exists on disk,
        otherwise ``None``. ``None`` is also returned when ``file`` is not
        installed, exits non-zero, or reports no ``execfn`` entry.
    """
    # Without this guard a host lacking file(1) makes subprocess raise
    # FileNotFoundError and aborts the whole report.
    if not shutil.which("file"):
        return None

    completed = run(["file", str(core_path)])
    if completed.returncode != 0:
        return None

    match = re.search(r"execfn: '([^']+)'", completed.stdout)
    if not match:
        return None

    recorded = Path(match.group(1))
    probes = [recorded]
    if not recorded.is_absolute():
        # A relative execfn was relative to the crashed process's working
        # directory, not to ours. The core's own directory is tried first
        # as the likeliest match (assumes cores are dumped into the
        # process's working directory -- depends on the host core pattern).
        # The plain relative path is kept as a fallback.
        probes.insert(0, core_path.parent / recorded)

    for probe in probes:
        if probe.is_file():
            return probe
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def infer_executable(core_path: Path, exe_arg: str | None) -> Path:
    """Determine which executable produced *core_path*.

    An explicit *exe_arg* always wins. Otherwise a binary is guessed from
    the directory holding the core (``db`` -> ``db``, ``auth`` ->
    ``game_auth``, ``channelN/coreM`` -> ``channelN_coreM``), followed by
    the ``execfn`` entry reported by ``file`` for the core.

    Args:
        core_path: Core file being analysed.
        exe_arg: Executable override from the command line, or ``None``.

    Returns:
        The resolved path of the first candidate that is a regular file.

    Raises:
        SystemExit: When the override is not a regular file, or when no
            candidate exists on disk.
    """
    if exe_arg:
        override = Path(exe_arg).resolve()
        if override.is_file():
            return override
        raise SystemExit(f"Executable not found: {override}")

    core_dir = core_path.parent
    dir_name = core_dir.name
    channel_name = core_dir.parent.name

    fixed_names = {"db": "db", "auth": "game_auth"}
    guesses: list[Path] = []
    if dir_name in fixed_names:
        guesses.append(core_dir / fixed_names[dir_name])
    elif dir_name.startswith("core") and channel_name.startswith("channel"):
        guesses.append(core_dir / f"{channel_name}_{dir_name}")

    # Always consulted, and ranked after the directory-based guess.
    from_notes = infer_execfn_from_file_output(core_path)
    if from_notes:
        guesses.append(from_notes)

    found = next((guess for guess in guesses if guess.is_file()), None)
    if found is None:
        raise SystemExit(f"Could not infer executable for core file: {core_path}")
    return found.resolve()
|
||||||
|
|
||||||
|
|
||||||
|
def preferred_debugger() -> str | None:
    """Return the first debugger found on ``PATH``.

    ``gdb`` is preferred over ``lldb``.

    Returns:
        ``"gdb"`` or ``"lldb"``, or ``None`` when neither is installed.
    """
    installed = (name for name in ("gdb", "lldb") if shutil.which(name))
    return next(installed, None)
|
||||||
|
|
||||||
|
|
||||||
|
def format_section(title: str, body: str) -> str:
    """Render one report section.

    Args:
        title: Text placed between the ``==`` markers of the heading line.
        body: Section content; trailing whitespace is stripped.

    Returns:
        The heading line, the trimmed body, and exactly one trailing
        newline.
    """
    heading = f"== {title} =="
    trimmed = body.rstrip()
    return f"{heading}\n{trimmed}\n"
|
||||||
|
|
||||||
|
|
||||||
|
def render_file_info(path: Path) -> str:
    """Render a report section with the ``file`` description of *path*.

    Args:
        path: File to describe.

    Returns:
        A formatted section holding the tool's stdout, or its stderr when
        stdout is empty, or ``<no output>`` when both are empty. An empty
        string is returned when ``file`` is not installed, so the caller
        can leave the section out.
    """
    # Same guard as the readelf section: without it a host lacking file(1)
    # makes subprocess raise FileNotFoundError and aborts the whole report.
    if not shutil.which("file"):
        return ""
    completed = run(["file", str(path)])
    body = completed.stdout or completed.stderr or "<no output>"
    return format_section(f"file {path}", body)
|
||||||
|
|
||||||
|
|
||||||
|
def render_readelf_notes(core_path: Path) -> str:
    """Render a report section with the ``readelf -n`` notes of a core.

    Args:
        core_path: Core file to inspect.

    Returns:
        A formatted section holding the tool's stdout, or its stderr when
        stdout is empty, or ``<no output>`` when both are empty. An empty
        string is returned when ``readelf`` is not installed.
    """
    if shutil.which("readelf"):
        result = run(["readelf", "-n", str(core_path)])
        if result.stdout:
            text = result.stdout
        elif result.stderr:
            text = result.stderr
        else:
            text = "<no output>"
        return format_section(f"readelf -n {core_path}", text)
    return ""
|
||||||
|
|
||||||
|
|
||||||
|
def render_debugger_backtrace(debugger: str, exe_path: Path, core_path: Path) -> str:
    """Run the debugger in batch mode and render its backtrace section.

    Args:
        debugger: ``"gdb"`` or ``"lldb"``.
        exe_path: Executable that produced the core.
        core_path: Core file to load.

    Returns:
        A formatted ``backtrace`` section. It starts with the command that
        was run, shell-quoted so it can be pasted back into a terminal,
        followed by the debugger output. When the debugger writes to both
        streams, stderr is appended under a ``[stderr]`` marker.

    Raises:
        SystemExit: When *debugger* is not a supported name.
    """
    # Local import: quoting is needed only here, and this keeps the block
    # self-contained.
    import shlex

    if debugger == "gdb":
        command = [
            "gdb",
            "-batch",
            "-ex",
            "set pagination off",
            "-ex",
            "thread apply all bt full",
            str(exe_path),
            str(core_path),
        ]
    elif debugger == "lldb":
        command = [
            "lldb",
            "--batch",
            "-o",
            "thread backtrace all",
            "-c",
            str(core_path),
            str(exe_path),
        ]
    else:
        raise SystemExit(f"Unsupported debugger: {debugger}")

    completed = run(command)
    if completed.stdout and completed.stderr:
        # Keep the debugger's diagnostics too; showing stdout alone would
        # silently drop anything it printed on stderr.
        output = f"{completed.stdout.rstrip()}\n\n[stderr]\n{completed.stderr}"
    else:
        output = completed.stdout or completed.stderr or "<no output>"

    # Quote each argument so multi-word values such as "set pagination off"
    # stay intact when the echoed command is copied.
    shown = " ".join(shlex.quote(part) for part in command)
    return format_section("backtrace", f"$ {shown}\n\n{output}")
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> int:
    """Build and print the crash report for one core file.

    Command-line options are parsed first, then root privileges are
    enforced. The report holds a summary, ``file`` output for the core and
    the executable, the core's readelf notes when available, and a
    debugger backtrace or a hint to install a debugger.

    Returns:
        Always ``0``; failures exit through ``SystemExit`` in the helpers.
    """
    args = parse_args()
    ensure_root()

    core_path = resolve_core_path(args.core)
    exe_path = infer_executable(core_path, args.exe)
    debugger = preferred_debugger()

    debugger_label = debugger or "<none>"
    summary_lines = (
        f"core: {core_path}",
        f"executable: {exe_path}",
        f"debugger: {debugger_label}",
    )
    report = [
        format_section("summary", "\n".join(summary_lines)),
        render_file_info(core_path),
        render_file_info(exe_path),
    ]

    notes = render_readelf_notes(core_path)
    if notes:
        report.append(notes)

    if debugger:
        trace = render_debugger_backtrace(debugger, exe_path, core_path)
    else:
        trace = format_section(
            "backtrace",
            "No supported debugger found. Install gdb or lldb on the host to generate a stack trace.",
        )
    report.append(trace)

    # Empty sections are dropped; the rest are joined by single newlines.
    print("\n".join(chunk.rstrip() for chunk in report if chunk).rstrip())
    return 0
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    exit_status = main()
    raise SystemExit(exit_status)
|
||||||
@@ -18,6 +18,7 @@ REPO_ROOT = Path("{{REPO_ROOT}}")
|
|||||||
RUNTIME_ROOT = Path("{{RUNTIME_ROOT}}")
|
RUNTIME_ROOT = Path("{{RUNTIME_ROOT}}")
|
||||||
HEALTHCHECK_PATH = Path("/usr/local/sbin/metin-login-healthcheck")
|
HEALTHCHECK_PATH = Path("/usr/local/sbin/metin-login-healthcheck")
|
||||||
INCIDENT_COLLECTOR_PATH = Path("/usr/local/sbin/metin-collect-incident")
|
INCIDENT_COLLECTOR_PATH = Path("/usr/local/sbin/metin-collect-incident")
|
||||||
|
CORE_BACKTRACE_PATH = Path("/usr/local/sbin/metin-core-backtrace")
|
||||||
INCIDENT_ROOT = Path("/var/lib/metin/incidents")
|
INCIDENT_ROOT = Path("/var/lib/metin/incidents")
|
||||||
AUTH_SYSLOG_PATH = RUNTIME_ROOT / "channels" / "auth" / "syslog.log"
|
AUTH_SYSLOG_PATH = RUNTIME_ROOT / "channels" / "auth" / "syslog.log"
|
||||||
SOURCE_REPO_ROOT = RUNTIME_ROOT.parent.parent / "repos" / "m2dev-server-src"
|
SOURCE_REPO_ROOT = RUNTIME_ROOT.parent.parent / "repos" / "m2dev-server-src"
|
||||||
@@ -116,6 +117,10 @@ def parse_args() -> argparse.Namespace:
|
|||||||
incident_collect.add_argument("--since", default="-30 minutes", help="journalctl --since value")
|
incident_collect.add_argument("--since", default="-30 minutes", help="journalctl --since value")
|
||||||
incident_collect.add_argument("--include-cores", action="store_true", help="Copy matching core files into the bundle")
|
incident_collect.add_argument("--include-cores", action="store_true", help="Copy matching core files into the bundle")
|
||||||
|
|
||||||
|
backtrace = subparsers.add_parser("backtrace", help="Generate a backtrace for the newest or selected core file")
|
||||||
|
backtrace.add_argument("--core", help="Core file path. Defaults to the newest core in the runtime tree.")
|
||||||
|
backtrace.add_argument("--exe", help="Executable path override.")
|
||||||
|
|
||||||
auth_failures = subparsers.add_parser("auth-failures", help="Show recent auth failures from auth syslog")
|
auth_failures = subparsers.add_parser("auth-failures", help="Show recent auth failures from auth syslog")
|
||||||
auth_failures.add_argument("--hours", type=int, default=24, help="How many hours back to inspect")
|
auth_failures.add_argument("--hours", type=int, default=24, help="How many hours back to inspect")
|
||||||
auth_failures.add_argument("--limit", type=int, default=20, help="Maximum failures to show")
|
auth_failures.add_argument("--limit", type=int, default=20, help="Maximum failures to show")
|
||||||
@@ -1194,6 +1199,19 @@ def run_incident_collect(tag: str, since: str, include_cores: bool) -> int:
|
|||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
|
||||||
|
def run_backtrace(core: str | None, exe: str | None) -> int:
    """Invoke the installed core backtrace helper.

    Args:
        core: Optional core file path, forwarded as ``--core``.
        exe: Optional executable override, forwarded as ``--exe``.

    Returns:
        Always ``0`` once the helper call returns.

    Raises:
        SystemExit: When the helper is not installed at
            ``CORE_BACKTRACE_PATH``.
    """
    helper = CORE_BACKTRACE_PATH
    if not helper.exists():
        raise SystemExit(f"Missing core backtrace helper: {CORE_BACKTRACE_PATH}")

    command = [str(helper)]
    command += ["--core", core] if core else []
    command += ["--exe", exe] if exe else []

    # NOTE(review): `run` is this file's own wrapper, defined outside this
    # view. Presumably require_root=True makes it run with root
    # privileges -- confirm.
    run(command, require_root=True)
    return 0
|
||||||
|
|
||||||
|
|
||||||
def main() -> int:
|
def main() -> int:
|
||||||
args = parse_args()
|
args = parse_args()
|
||||||
|
|
||||||
@@ -1231,6 +1249,8 @@ def main() -> int:
|
|||||||
return run_logs(args.target, args.lines, args.follow)
|
return run_logs(args.target, args.lines, args.follow)
|
||||||
if args.command == "incident-collect":
|
if args.command == "incident-collect":
|
||||||
return run_incident_collect(args.tag, args.since, args.include_cores)
|
return run_incident_collect(args.tag, args.since, args.include_cores)
|
||||||
|
if args.command == "backtrace":
|
||||||
|
return run_backtrace(args.core, args.exe)
|
||||||
if args.command == "healthcheck":
|
if args.command == "healthcheck":
|
||||||
return run_healthcheck(args.mode)
|
return run_healthcheck(args.mode)
|
||||||
if args.command == "wait-ready":
|
if args.command == "wait-ready":
|
||||||
|
|||||||
@@ -149,6 +149,11 @@ def main() -> int:
|
|||||||
render_template(BIN_DIR / "metin-collect-incident.in", template_values),
|
render_template(BIN_DIR / "metin-collect-incident.in", template_values),
|
||||||
0o700,
|
0o700,
|
||||||
)
|
)
|
||||||
|
write_text(
|
||||||
|
sbin_dir / "metin-core-backtrace",
|
||||||
|
render_template(BIN_DIR / "metin-core-backtrace.in", template_values),
|
||||||
|
0o700,
|
||||||
|
)
|
||||||
copy_file(
|
copy_file(
|
||||||
HEALTHCHECK_DIR / "metin-login-healthcheck.sh",
|
HEALTHCHECK_DIR / "metin-login-healthcheck.sh",
|
||||||
sbin_dir / "metin-login-healthcheck",
|
sbin_dir / "metin-login-healthcheck",
|
||||||
|
|||||||
@@ -47,6 +47,7 @@ The Debian deployment installs:
|
|||||||
- restarting the whole stack or specific channels/instances
|
- restarting the whole stack or specific channels/instances
|
||||||
- viewing logs
|
- viewing logs
|
||||||
- listing core files in the runtime tree
|
- listing core files in the runtime tree
|
||||||
|
- generating a backtrace for the newest or selected core file
|
||||||
- collecting incident bundles
|
- collecting incident bundles
|
||||||
- running the root-only headless healthcheck
|
- running the root-only headless healthcheck
|
||||||
- waiting for login-ready state after restart
|
- waiting for login-ready state after restart
|
||||||
@@ -185,6 +186,18 @@ List core files currently present in the runtime tree:
|
|||||||
metinctl cores
|
metinctl cores
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Generate a backtrace for the newest core file:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
metinctl backtrace
|
||||||
|
```
|
||||||
|
|
||||||
|
Generate a backtrace for one specific core file:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
metinctl backtrace --core channels/channel1/core1/core.2255450
|
||||||
|
```
|
||||||
|
|
||||||
Collect an incident bundle with logs, unit status, port state and repository revisions:
|
Collect an incident bundle with logs, unit status, port state and repository revisions:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
@@ -214,6 +227,7 @@ This makes channel enablement declarative instead of depending on whatever happe
|
|||||||
The Debian deployment now also installs:
|
The Debian deployment now also installs:
|
||||||
|
|
||||||
- `/usr/local/sbin/metin-collect-incident`
|
- `/usr/local/sbin/metin-collect-incident`
|
||||||
|
- `/usr/local/sbin/metin-core-backtrace`
|
||||||
|
|
||||||
The collector creates a timestamped bundle under:
|
The collector creates a timestamped bundle under:
|
||||||
|
|
||||||
@@ -231,3 +245,11 @@ Each bundle contains:
|
|||||||
If you call it with `--include-cores`, matching core files are copied into the bundle as well.
|
If you call it with `--include-cores`, matching core files are copied into the bundle as well.
|
||||||
|
|
||||||
The runtime units now also declare `LimitCORE=infinity`, so after the next service restart the processes are allowed to emit core dumps when the host kernel/core policy permits it.
|
The runtime units now also declare `LimitCORE=infinity`, so after the next service restart the processes are allowed to emit core dumps when the host kernel/core policy permits it.
|
||||||
|
|
||||||
|
For quick manual crash triage outside the incident bundle flow, use:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
metinctl backtrace
|
||||||
|
```
|
||||||
|
|
||||||
|
It defaults to the newest core file under the runtime tree, infers the executable path, and uses `gdb` (or `lldb` if `gdb` is not installed) to produce the stack trace. If no supported debugger is installed, it still prints `file` output for the core and the executable and, when `readelf` is available, the ELF notes of the core.
|
||||||
|
|||||||
Reference in New Issue
Block a user