REPO_ROOT = Path("{{REPO_ROOT}}")
RUNTIME_ROOT = Path("{{RUNTIME_ROOT}}")
INCIDENT_ROOT_DEFAULT = Path("/var/lib/metin/incidents")

# Exit status reported by run(check=False) when a command could not be started
# at all; mirrors the shell convention for "command not found".
COMMAND_UNAVAILABLE = 127

# channel_inventory is loaded from the repository checkout (see _stack_units).
sys.path.insert(0, str(REPO_ROOT))


def parse_args() -> argparse.Namespace:
    """Parse the collector's command-line options."""
    parser = argparse.ArgumentParser(description="Collect a Metin runtime incident bundle")
    parser.add_argument("--tag", default="manual", help="Short incident tag used in the bundle directory name")
    parser.add_argument("--since", default="-30 minutes", help="journalctl --since value")
    parser.add_argument("--output-root", default=str(INCIDENT_ROOT_DEFAULT), help="Incident bundle root directory")
    parser.add_argument("--include-cores", action="store_true", help="Copy matching core files into the bundle")
    return parser.parse_args()


def ensure_root() -> None:
    """Exit with a short message unless the effective user is root."""
    if os.geteuid() != 0:
        raise SystemExit("Run as root.")


def sanitize_tag(value: str) -> str:
    """Return *value* reduced to characters that are safe in a directory name.

    Surrounding whitespace is stripped; every remaining character that is not
    alphanumeric, ``-`` or ``_`` is replaced by ``-``.  An empty result falls
    back to ``"manual"``.
    """
    filtered = "".join(char if char.isalnum() or char in {"-", "_"} else "-" for char in value.strip())
    return filtered or "manual"


def run(command: list[str], check: bool = True) -> subprocess.CompletedProcess[str]:
    """Run *command* and capture its stdout/stderr as text.

    Args:
        command: Argument vector; executed without a shell.
        check: When true, a non-zero exit raises ``CalledProcessError`` and a
            command that cannot be started raises ``OSError``.  When false the
            call is best-effort: a command that cannot be started yields a
            synthetic result with return code ``COMMAND_UNAVAILABLE`` and the
            reason in ``stderr`` instead of raising.

    Returns:
        The completed process with decoded ``stdout`` and ``stderr``.
    """
    try:
        return subprocess.run(
            command,
            check=check,
            capture_output=True,
            text=True,
            # Logs and journal output may contain bytes that are not valid in
            # the locale encoding; replace them instead of aborting the whole
            # collection with UnicodeDecodeError.
            errors="replace",
        )
    except OSError as exc:
        if check:
            raise
        # A missing diagnostic tool must not kill the collector mid-bundle.
        return subprocess.CompletedProcess(
            command,
            COMMAND_UNAVAILABLE,
            stdout="",
            stderr=f"{command[0]}: {exc}\n",
        )


def write_text(path: Path, content: str) -> None:
    """Write *content* to *path* as UTF-8, creating parent directories."""
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(content, encoding="utf-8")


def _render_output(header: str, completed: subprocess.CompletedProcess[str]) -> str:
    """Format a command result: header, stdout, then stderr and exit status if any."""
    parts = [header, "\n\n"]
    if completed.stdout:
        parts.append(completed.stdout)
    if completed.stderr:
        parts.append("\n[stderr]\n" + completed.stderr)
    if completed.returncode != 0:
        # e.g. `systemctl status` exits non-zero for inactive units; keep that visible.
        parts.append(f"\n[exit status {completed.returncode}]\n")
    return "".join(parts)


def write_command_output(bundle_dir: Path, filename: str, command: list[str], check: bool = False) -> None:
    """Run *command* and store its output in ``bundle_dir / filename``.

    The file starts with a ``$ <command>`` line, followed by stdout, a
    ``[stderr]`` section when stderr is non-empty and an ``[exit status N]``
    line when the command did not exit with 0.  *check* is forwarded to
    :func:`run`.
    """
    completed = run(command, check=check)
    write_text(bundle_dir / filename, _render_output(f"$ {' '.join(command)}", completed))


def copy_log_tails(bundle_dir: Path, lines: int = 400) -> None:
    """Store the last *lines* lines of every runtime syslog/syserr file.

    Files matching ``channels/**/syslog.log`` and ``channels/**/syserr.log``
    under ``RUNTIME_ROOT`` are written to ``bundle_dir / "logs"`` using the
    same path relative to ``RUNTIME_ROOT``.
    """
    logs_dir = bundle_dir / "logs"
    candidates = sorted(RUNTIME_ROOT.glob("channels/**/syslog.log")) + sorted(
        RUNTIME_ROOT.glob("channels/**/syserr.log")
    )
    for path in candidates:
        if not path.is_file():
            continue
        completed = run(["tail", "-n", str(lines), str(path)], check=False)
        write_text(
            logs_dir / path.relative_to(RUNTIME_ROOT),
            _render_output(f"# tail -n {lines} {path}", completed),
        )


def find_core_files() -> list[Path]:
    """Return regular files matching ``channels/**/core*`` under ``RUNTIME_ROOT``, sorted."""
    return [path for path in sorted(RUNTIME_ROOT.glob("channels/**/core*")) if path.is_file()]


def write_core_metadata(bundle_dir: Path, core_files: list[Path]) -> None:
    """Write ``core-files.json`` with path, size and UTC mtime of each core file.

    Files that no longer exist when they are inspected are left out.
    """
    rows = []
    for path in core_files:
        try:
            stat = path.stat()
        except FileNotFoundError:
            # Removed between discovery and inspection; nothing to describe.
            continue
        rows.append(
            {
                "path": str(path),
                "size_bytes": stat.st_size,
                "mtime": datetime.fromtimestamp(stat.st_mtime, tz=timezone.utc).isoformat(),
            }
        )
    write_text(bundle_dir / "core-files.json", json.dumps(rows, indent=2))


def copy_core_files(bundle_dir: Path, core_files: list[Path]) -> None:
    """Copy *core_files* into ``bundle_dir / "cores"``, keeping their path relative to ``RUNTIME_ROOT``.

    Files that no longer exist when they are copied are skipped.
    """
    cores_dir = bundle_dir / "cores"
    for path in core_files:
        destination = cores_dir / path.relative_to(RUNTIME_ROOT)
        destination.parent.mkdir(parents=True, exist_ok=True)
        try:
            shutil.copy2(path, destination)
        except FileNotFoundError:
            continue


def git_summary(repo_path: Path) -> dict[str, object]:
    """Describe the git state of *repo_path*.

    Returns:
        A dict with ``path`` and ``present``.  When the path exists it also
        holds ``head``, ``branch``, ``dirty`` and ``status`` taken from git,
        plus ``error`` (git's stderr) when ``git rev-parse HEAD`` failed, so
        an empty head is not mistaken for a clean checkout.
    """
    summary: dict[str, object] = {"path": str(repo_path), "present": repo_path.exists()}
    if not summary["present"]:
        return summary

    git = ["git", "-C", str(repo_path)]
    head = run([*git, "rev-parse", "HEAD"], check=False)
    status = run([*git, "status", "--short"], check=False)
    branch = run([*git, "rev-parse", "--abbrev-ref", "HEAD"], check=False)
    summary.update(
        {
            "head": head.stdout.strip(),
            "branch": branch.stdout.strip(),
            "dirty": bool(status.stdout.strip()),
            "status": status.stdout.splitlines(),
        }
    )
    if head.returncode != 0:
        # Typical causes: git not installed, not a repository, or git refusing
        # a checkout owned by another user (safe.directory) when run as root.
        summary["error"] = head.stderr.strip()
    return summary


def _stack_units() -> list[str]:
    """Return the stack, db, db-ready, auth and game unit names from the inventory.

    The inventory module is imported here rather than at module import time so
    that a missing or broken checkout produces a one-line error instead of a
    traceback.
    """
    try:
        import channel_inventory
    except ImportError as exc:
        raise SystemExit(f"Cannot import channel_inventory from {REPO_ROOT}: {exc}") from exc

    return [
        channel_inventory.STACK_UNIT,
        channel_inventory.DB_UNIT,
        channel_inventory.DB_READY_UNIT,
        channel_inventory.AUTH_UNIT,
        *channel_inventory.get_game_units(),
    ]


def main() -> int:
    """Collect an incident bundle and print its directory.

    Returns:
        ``0`` once the bundle has been written.

    Raises:
        SystemExit: when not run as root, when the inventory cannot be
            imported, or when the bundle directory already exists.
    """
    args = parse_args()
    ensure_root()
    # Resolve units before touching the filesystem so a failure leaves no empty bundle.
    units = _stack_units()

    # One clock reading for both the directory name and created_at.
    now = datetime.now(timezone.utc)
    tag = sanitize_tag(args.tag)
    output_root = Path(args.output_root)
    bundle_dir = output_root / f"{now.strftime('%Y%m%dT%H%M%SZ')}-{tag}"
    try:
        # Create the bundle private from the start; mode only applies to the
        # leaf directory, parents get default permissions.
        bundle_dir.mkdir(mode=0o700, parents=True, exist_ok=False)
    except FileExistsError:
        raise SystemExit(f"Incident bundle already exists: {bundle_dir}") from None
    # mkdir's mode is filtered by the umask; enforce the exact permissions.
    os.chmod(bundle_dir, 0o700)

    source_repo = RUNTIME_ROOT.parent.parent / "repos" / "m2dev-server-src"
    runtime_repo = REPO_ROOT
    meta = {
        "created_at": now.isoformat(),
        "hostname": socket.gethostname(),
        "runtime_root": str(RUNTIME_ROOT),
        "output_root": str(output_root),
        "tag": tag,
        "since": args.since,
        "repos": {
            "m2dev-server": git_summary(runtime_repo),
            "m2dev-server-src": git_summary(source_repo),
        },
    }
    write_text(bundle_dir / "meta.json", json.dumps(meta, indent=2))

    write_command_output(bundle_dir, "uname.txt", ["uname", "-a"])
    write_command_output(bundle_dir, "df.txt", ["df", "-h"])
    write_command_output(bundle_dir, "free.txt", ["free", "-h"], check=False)
    write_command_output(bundle_dir, "ports.txt", ["ss", "-ltnp"], check=False)
    write_command_output(bundle_dir, "systemctl-status.txt", ["systemctl", "status", "--no-pager", *units], check=False)

    journal_dir = bundle_dir / "journal"
    for unit in units:
        safe_name = unit.replace("@", "_").replace(".", "_")
        write_command_output(
            journal_dir,
            f"{safe_name}.log",
            ["journalctl", "--no-pager", "--since", args.since, "-u", unit],
            check=False,
        )

    copy_log_tails(bundle_dir)

    core_files = find_core_files()
    write_core_metadata(bundle_dir, core_files)
    if args.include_cores and core_files:
        copy_core_files(bundle_dir, core_files)

    print(bundle_dir)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
def iter_core_files() -> list[Path]:
    """Return regular files matching ``channels/**/core*`` under ``RUNTIME_ROOT``, sorted."""
    return [path for path in sorted(RUNTIME_ROOT.glob("channels/**/core*")) if path.is_file()]


def print_cores(as_json: bool) -> int:
    """List core files found under the runtime tree.

    Args:
        as_json: Print the entries as indented JSON instead of a table.

    Returns:
        ``0`` in every case, including when no core file exists.
    """
    entries = []
    for path in iter_core_files():
        try:
            stat = path.stat()
        except FileNotFoundError:
            # Removed between the directory scan and the stat call.
            continue
        entries.append(
            {
                "path": str(path),
                "relative_path": str(path.relative_to(RUNTIME_ROOT)),
                "size_bytes": stat.st_size,
                "mtime_epoch": int(stat.st_mtime),
            }
        )

    if as_json:
        print(json.dumps(entries, indent=2))
        return 0

    if not entries:
        print("No core files found under the runtime tree.")
        return 0

    rows = [[entry["relative_path"], str(entry["size_bytes"]), str(entry["mtime_epoch"])] for entry in entries]
    print_table(["path", "size_bytes", "mtime_epoch"], rows)
    return 0


def print_incidents(limit: int) -> int:
    """Print up to *limit* incident bundle directories, sorted by name in descending order.

    Raises:
        SystemExit: when *limit* is smaller than 1.
    """
    if limit < 1:
        # A negative value would otherwise act as a slice offset and silently
        # drop bundles from the end of the listing instead of limiting it.
        raise SystemExit("--limit must be a positive integer")

    if not INCIDENT_ROOT.exists():
        print(f"No incident directory: {INCIDENT_ROOT}")
        return 0

    bundles = sorted((path for path in INCIDENT_ROOT.iterdir() if path.is_dir()), reverse=True)[:limit]
    if not bundles:
        print(f"No incident bundles in {INCIDENT_ROOT}")
        return 0

    rows = [[bundle.name, str(bundle)] for bundle in bundles]
    print_table(["bundle", "path"], rows)
    return 0


def run_incident_collect(tag: str, since: str, include_cores: bool) -> int:
    """Invoke the installed incident collector with the given options.

    Raises:
        SystemExit: when the collector script is not installed.
    """
    if not INCIDENT_COLLECTOR_PATH.exists():
        raise SystemExit(f"Missing incident collector: {INCIDENT_COLLECTOR_PATH}")

    # Use the --option=value form: the collector parses with argparse, which
    # would treat a separate, space-free value starting with "-" (such as
    # "-1h") as another option and fail with "expected one argument".
    command = [str(INCIDENT_COLLECTOR_PATH), f"--tag={tag}", f"--since={since}"]
    if include_cores:
        command.append("--include-cores")
    run(command, require_root=True)
    return 0
parser.add_argument("--libexec-dir", default="/usr/local/libexec", help="Helper script destination") parser.add_argument("--bin-dir", default="/usr/local/bin", help="Binary/script destination") + parser.add_argument("--sbin-dir", default="/usr/local/sbin", help="Root-only binary/script destination") parser.add_argument("--env-file", default="/etc/metin/metin.env", help="Optional EnvironmentFile path for runtime overrides") parser.add_argument("--wait-host", default="127.0.0.1", help="DB readiness host") parser.add_argument("--wait-port", type=int, default=9000, help="DB readiness port") @@ -100,6 +101,7 @@ def main() -> int: systemd_dir = Path(args.systemd_dir) libexec_dir = Path(args.libexec_dir) bin_dir = Path(args.bin_dir) + sbin_dir = Path(args.sbin_dir) selected_channels = resolve_channels(args) instances = resolve_instances(selected_channels) @@ -141,6 +143,11 @@ def main() -> int: render_template(BIN_DIR / "metinctl.in", template_values), 0o755, ) + write_text( + sbin_dir / "metin-collect-incident", + render_template(BIN_DIR / "metin-collect-incident.in", template_values), + 0o700, + ) verify_units = [str(systemd_dir / unit_name) for unit_name in unit_names] run(["systemd-analyze", "verify", *verify_units]) diff --git a/deploy/systemd/templates/metin-auth.service.in b/deploy/systemd/templates/metin-auth.service.in index 05785b1..5019812 100644 --- a/deploy/systemd/templates/metin-auth.service.in +++ b/deploy/systemd/templates/metin-auth.service.in @@ -18,6 +18,7 @@ RestartSec=5 KillSignal=SIGTERM TimeoutStopSec=60 LimitNOFILE=65535 +LimitCORE=infinity [Install] RequiredBy=metin-server.service diff --git a/deploy/systemd/templates/metin-db.service.in b/deploy/systemd/templates/metin-db.service.in index 63cecba..9911673 100644 --- a/deploy/systemd/templates/metin-db.service.in +++ b/deploy/systemd/templates/metin-db.service.in @@ -18,6 +18,7 @@ RestartSec=5 KillSignal=SIGTERM TimeoutStopSec=180 LimitNOFILE=65535 +LimitCORE=infinity [Install] 
RequiredBy=metin-server.service diff --git a/deploy/systemd/templates/metin-game@.service.in b/deploy/systemd/templates/metin-game@.service.in index 3c5552a..576f42f 100644 --- a/deploy/systemd/templates/metin-game@.service.in +++ b/deploy/systemd/templates/metin-game@.service.in @@ -18,6 +18,7 @@ RestartSec=5 KillSignal=SIGTERM TimeoutStopSec=60 LimitNOFILE=65535 +LimitCORE=infinity [Install] RequiredBy=metin-server.service diff --git a/docs/server-management.md b/docs/server-management.md index 3c7bec9..8244868 100644 --- a/docs/server-management.md +++ b/docs/server-management.md @@ -38,6 +38,8 @@ The Debian deployment installs: - listing declared ports - restarting the whole stack or specific channels/instances - viewing logs +- listing core files in the runtime tree +- collecting incident bundles - running the root-only headless healthcheck ## Examples @@ -84,6 +86,24 @@ Run the end-to-end healthcheck: metinctl healthcheck ``` +List core files currently present in the runtime tree: + +```bash +metinctl cores +``` + +Collect an incident bundle with logs, unit status, port state and repository revisions: + +```bash +metinctl incident-collect --tag auth-timeout --since "-20 minutes" +``` + +List the most recent incident bundles: + +```bash +metinctl incidents +``` + ## systemd installer behavior `deploy/systemd/install_systemd.py` now uses the same inventory and installs `metinctl`. @@ -95,3 +115,26 @@ It also reconciles enabled game instance units against the selected channels: - if `--restart` is passed, stale game units are disabled with `--now` This makes channel enablement declarative instead of depending on whatever happened to be enabled previously. 
+ +## Crash / Incident Pipeline + +The Debian deployment now also installs: + +- `/usr/local/sbin/metin-collect-incident` + +The collector creates a timestamped bundle under: + +- `/var/lib/metin/incidents` + +Each bundle contains: + +- repo revisions for `m2dev-server` and `m2dev-server-src` +- `systemctl status` for the whole stack +- recent `journalctl` output per unit +- listener state from `ss -ltnp` +- tailed runtime `syslog.log` and `syserr.log` files +- metadata for any `core*` files found under `runtime/server/channels` + +If you call it with `--include-cores`, matching core files are copied into the bundle as well. + +The runtime units now also declare `LimitCORE=infinity`, so after the next service restart the processes are allowed to emit core dumps when the host kernel/core policy permits it.