ops: add incident collection pipeline
This commit is contained in:
@@ -34,11 +34,14 @@ The channel selection and port layout now come from the versioned inventory file
|
||||
- `/usr/local/libexec/metin-game-instance-start`
|
||||
- `/usr/local/libexec/metin-wait-port`
|
||||
- `/usr/local/bin/metinctl`
|
||||
- `/usr/local/sbin/metin-collect-incident`
|
||||
|
||||
The `metin-db-ready.service` gate waits until the DB socket is actually accepting connections before `auth` and `game` units start.
|
||||
|
||||
The installer also reconciles enabled `metin-game@...` instances against the selected channel set so stale units do not stay enabled forever.
|
||||
|
||||
The runtime unit templates now also set `LimitCORE=infinity` for `db`, `auth`, and `game` services.
|
||||
|
||||
## Optional Environment File
|
||||
|
||||
The runtime units support an optional `EnvironmentFile` for host-local overrides:
|
||||
|
||||
191
deploy/systemd/bin/metin-collect-incident.in
Normal file
191
deploy/systemd/bin/metin-collect-incident.in
Normal file
@@ -0,0 +1,191 @@
|
||||
#!/usr/bin/env python3
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import socket
|
||||
import subprocess
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
REPO_ROOT = Path("{{REPO_ROOT}}")
|
||||
RUNTIME_ROOT = Path("{{RUNTIME_ROOT}}")
|
||||
INCIDENT_ROOT_DEFAULT = Path("/var/lib/metin/incidents")
|
||||
|
||||
sys.path.insert(0, str(REPO_ROOT))
|
||||
|
||||
import channel_inventory
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse the incident collector's command-line options.

    Returns:
        Namespace carrying ``tag``, ``since``, ``output_root`` and
        ``include_cores``.
    """
    cli = argparse.ArgumentParser(description="Collect a Metin runtime incident bundle")
    cli.add_argument(
        "--tag",
        default="manual",
        help="Short incident tag used in the bundle directory name",
    )
    cli.add_argument(
        "--since",
        default="-30 minutes",
        help="journalctl --since value",
    )
    cli.add_argument(
        "--output-root",
        default=str(INCIDENT_ROOT_DEFAULT),
        help="Incident bundle root directory",
    )
    cli.add_argument(
        "--include-cores",
        action="store_true",
        help="Copy matching core files into the bundle",
    )
    return cli.parse_args()
|
||||
|
||||
|
||||
def ensure_root() -> None:
    """Abort unless the effective user is root.

    Raises:
        SystemExit: With the message ``Run as root.`` when the effective
            UID is not 0.
    """
    running_as_root = os.geteuid() == 0
    if running_as_root:
        return
    raise SystemExit("Run as root.")
|
||||
|
||||
|
||||
def sanitize_tag(value: str) -> str:
    """Turn a user-supplied tag into a string safe for a directory name.

    Surrounding whitespace is stripped; every remaining character that is
    not alphanumeric, ``-`` or ``_`` is replaced by ``-``.

    Args:
        value: Raw tag text.

    Returns:
        The cleaned tag, or ``"manual"`` when nothing is left after
        stripping.
    """
    pieces = []
    for symbol in value.strip():
        keep = symbol.isalnum() or symbol in ("-", "_")
        pieces.append(symbol if keep else "-")
    cleaned = "".join(pieces)
    if not cleaned:
        return "manual"
    return cleaned
|
||||
|
||||
|
||||
def run(command: list[str], check: bool = True) -> subprocess.CompletedProcess[str]:
    """Run *command* without a shell and capture its output as text.

    Args:
        command: Argument vector to execute.
        check: When true, a non-zero exit status raises
            ``subprocess.CalledProcessError`` and a command that cannot be
            started raises ``OSError``. When false, both situations are
            reported through the returned object instead of an exception.

    Returns:
        The completed process with decoded ``stdout`` and ``stderr``. If the
        command could not be started and ``check`` is false, the return code
        is 127 and ``stderr`` holds the OS error text.

    Raises:
        subprocess.CalledProcessError: Non-zero exit status with ``check``.
        OSError: Command could not be started with ``check``.
    """
    try:
        # errors="replace": captured log/journal output is not guaranteed to
        # be valid in the locale encoding; a stray byte must not abort the
        # whole collection with UnicodeDecodeError.
        return subprocess.run(
            command,
            check=check,
            capture_output=True,
            text=True,
            errors="replace",
        )
    except OSError as exc:
        if check:
            raise
        # Best-effort callers pass check=False; a missing or non-executable
        # tool is reported like any other failed command. 127 mirrors the
        # shell's "command not found" status.
        return subprocess.CompletedProcess(command, 127, stdout="", stderr=f"{exc}\n")
|
||||
|
||||
|
||||
def write_text(path: Path, content: str) -> None:
    """Write *content* to *path* as UTF-8, creating parent directories first.

    Args:
        path: Destination file; an existing file is overwritten.
        content: Text to store.
    """
    parent_dir = path.parent
    parent_dir.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as handle:
        handle.write(content)
|
||||
|
||||
|
||||
def write_command_output(bundle_dir: Path, filename: str, command: list[str], check: bool = False) -> None:
    """Run *command* and store a transcript of it inside the bundle.

    The transcript starts with a ``$ <command>`` header, followed by the
    captured stdout, a ``[stderr]`` section when stderr is non-empty, and an
    ``[exit status N]`` line when the command exited non-zero.

    Args:
        bundle_dir: Directory that receives the transcript.
        filename: File name (relative to *bundle_dir*) of the transcript.
        command: Argument vector to execute.
        check: Forwarded to ``run``; when true a failing command raises
            instead of being recorded.
    """
    import shlex  # local import keeps this function self-contained

    completed = run(command, check=check)

    # shlex.join quotes arguments that contain spaces (e.g. the default
    # --since "-30 minutes"), so the header shows the exact argv and can be
    # pasted back into a shell.
    sections = [f"$ {shlex.join(command)}\n\n"]
    if completed.stdout:
        sections.append(completed.stdout)
    if completed.stderr:
        sections.append("\n[stderr]\n" + completed.stderr)
    if completed.returncode != 0:
        # Without this a command that fails silently leaves an empty
        # transcript that looks like "nothing to report".
        sections.append(f"\n[exit status {completed.returncode}]\n")

    write_text(bundle_dir / filename, "".join(sections))
|
||||
|
||||
|
||||
def copy_log_tails(bundle_dir: Path) -> None:
    """Store the last 400 lines of every runtime syslog/syserr file.

    Files named ``syslog.log`` and ``syserr.log`` anywhere below
    ``RUNTIME_ROOT/channels`` are tailed with ``tail -n 400``; each result is
    written under ``<bundle_dir>/logs`` at the same path relative to
    ``RUNTIME_ROOT``.

    Args:
        bundle_dir: Incident bundle directory.
    """
    logs_dir = bundle_dir / "logs"

    # All syslog files first, then all syserr files, each group sorted.
    candidates: list[Path] = []
    for log_name in ("syslog.log", "syserr.log"):
        candidates.extend(sorted(RUNTIME_ROOT.glob(f"channels/**/{log_name}")))

    for source in candidates:
        if not source.is_file():
            continue

        target = logs_dir / source.relative_to(RUNTIME_ROOT)
        target.parent.mkdir(parents=True, exist_ok=True)

        tail = run(["tail", "-n", "400", str(source)], check=False)
        parts = [f"# tail -n 400 {source}\n\n"]
        if tail.stdout:
            parts.append(tail.stdout)
        if tail.stderr:
            parts.append("\n[stderr]\n" + tail.stderr)
        target.write_text("".join(parts), encoding="utf-8")
|
||||
|
||||
|
||||
def find_core_files() -> list[Path]:
    """Return regular files matching ``core*`` below ``RUNTIME_ROOT/channels``.

    Returns:
        Matching paths in sorted order; directories and other non-files are
        excluded.
    """
    candidates = sorted(RUNTIME_ROOT.glob("channels/**/core*"))
    return [candidate for candidate in candidates if candidate.is_file()]
|
||||
|
||||
|
||||
def write_core_metadata(bundle_dir: Path, core_files: list[Path]) -> None:
    """Write ``core-files.json`` describing each core file.

    Every entry records the file's path, its size in bytes and its
    modification time as a UTC ISO-8601 string.

    Args:
        bundle_dir: Incident bundle directory.
        core_files: Core files to describe; an empty list yields ``[]``.
    """

    def describe(core: Path) -> dict[str, object]:
        info = core.stat()
        modified = datetime.fromtimestamp(info.st_mtime, tz=timezone.utc)
        return {
            "path": str(core),
            "size_bytes": info.st_size,
            "mtime": modified.isoformat(),
        }

    listing = [describe(core) for core in core_files]
    write_text(bundle_dir / "core-files.json", json.dumps(listing, indent=2))
|
||||
|
||||
|
||||
def copy_core_files(bundle_dir: Path, core_files: list[Path]) -> None:
    """Copy core files into ``<bundle_dir>/cores``.

    Each file keeps its path relative to ``RUNTIME_ROOT`` and is copied with
    ``shutil.copy2``.

    Args:
        bundle_dir: Incident bundle directory.
        core_files: Core files located under ``RUNTIME_ROOT``.
    """
    cores_dir = bundle_dir / "cores"
    for source in core_files:
        target = cores_dir.joinpath(source.relative_to(RUNTIME_ROOT))
        target.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(source, target)
|
||||
|
||||
|
||||
def git_summary(repo_path: Path) -> dict[str, object]:
    """Describe the git state of *repo_path* for the bundle metadata.

    Args:
        repo_path: Directory expected to be a git work tree.

    Returns:
        A dict with ``path`` and ``present``. When the path exists it also
        contains ``head``, ``branch``, ``status`` (list of ``git status
        --short`` lines) and ``dirty``. ``dirty`` is ``None`` when
        ``git status`` itself failed, so an unreadable repository is not
        reported as clean. If any git call failed, ``errors`` maps the failed
        query name to git's stderr (or its exit status).
    """
    present = repo_path.exists()
    summary: dict[str, object] = {"path": str(repo_path), "present": present}
    if not present:
        return summary

    def git(*arguments: str) -> subprocess.CompletedProcess[str]:
        command = ["git", "-C", str(repo_path), *arguments]
        try:
            return run(command, check=False)
        except OSError as exc:
            # git missing or not executable: record it instead of aborting
            # the whole incident collection.
            return subprocess.CompletedProcess(command, 127, stdout="", stderr=str(exc))

    results = {
        "head": git("rev-parse", "HEAD"),
        "status": git("status", "--short"),
        "branch": git("rev-parse", "--abbrev-ref", "HEAD"),
    }

    status = results["status"]
    summary.update(
        {
            "head": results["head"].stdout.strip(),
            "branch": results["branch"].stdout.strip(),
            # Empty output only means "clean" when the command succeeded.
            "dirty": bool(status.stdout.strip()) if status.returncode == 0 else None,
            "status": status.stdout.splitlines(),
        }
    )

    errors = {
        name: result.stderr.strip() or f"exit status {result.returncode}"
        for name, result in results.items()
        if result.returncode != 0
    }
    if errors:
        summary["errors"] = errors
    return summary
|
||||
|
||||
|
||||
def main() -> int:
    """Collect one incident bundle and print its directory.

    The bundle is created as ``<output-root>/<UTC timestamp>-<tag>`` with
    mode 0700 and filled with: ``meta.json``, host information (``uname``,
    ``df``, ``free``, ``ss``), ``systemctl status`` for all units, one
    journal extract per unit, tails of the runtime logs, core-file metadata
    and, with ``--include-cores``, the core files themselves.

    Returns:
        0 on success.

    Raises:
        SystemExit: When not run as root, or when the bundle directory
            already exists.
    """
    args = parse_args()
    ensure_root()

    # Read the clock once so the directory name and meta.json agree.
    created_at = datetime.now(timezone.utc)
    timestamp = created_at.strftime("%Y%m%dT%H%M%SZ")
    tag = sanitize_tag(args.tag)
    output_root = Path(args.output_root)
    bundle_dir = output_root / f"{timestamp}-{tag}"

    output_root.mkdir(parents=True, exist_ok=True)
    try:
        # Create the directory already restricted so it is never briefly
        # readable by other users.
        bundle_dir.mkdir(mode=0o700)
    except FileExistsError:
        raise SystemExit(f"Incident bundle already exists: {bundle_dir}") from None
    # chmod is not subject to the umask; keep it so the final mode is exact.
    os.chmod(bundle_dir, 0o700)

    units = [
        channel_inventory.STACK_UNIT,
        channel_inventory.DB_UNIT,
        channel_inventory.DB_READY_UNIT,
        channel_inventory.AUTH_UNIT,
        *channel_inventory.get_game_units(),
    ]

    source_repo = RUNTIME_ROOT.parent.parent / "repos" / "m2dev-server-src"
    runtime_repo = REPO_ROOT
    meta = {
        "created_at": created_at.isoformat(),
        "hostname": socket.gethostname(),
        "runtime_root": str(RUNTIME_ROOT),
        "output_root": str(output_root),
        "tag": tag,
        "since": args.since,
        "repos": {
            "m2dev-server": git_summary(runtime_repo),
            "m2dev-server-src": git_summary(source_repo),
        },
    }
    write_text(bundle_dir / "meta.json", json.dumps(meta, indent=2))

    write_command_output(bundle_dir, "uname.txt", ["uname", "-a"])
    write_command_output(bundle_dir, "df.txt", ["df", "-h"])
    write_command_output(bundle_dir, "free.txt", ["free", "-h"], check=False)
    write_command_output(bundle_dir, "ports.txt", ["ss", "-ltnp"], check=False)
    write_command_output(bundle_dir, "systemctl-status.txt", ["systemctl", "status", "--no-pager", *units], check=False)

    journal_dir = bundle_dir / "journal"
    for unit in units:
        # "@" and "." are replaced to derive the per-unit journal file name.
        safe_name = unit.replace("@", "_").replace(".", "_")
        write_command_output(
            journal_dir,
            f"{safe_name}.log",
            ["journalctl", "--no-pager", "--since", args.since, "-u", unit],
            check=False,
        )

    copy_log_tails(bundle_dir)

    core_files = find_core_files()
    write_core_metadata(bundle_dir, core_files)
    if args.include_cores and core_files:
        copy_core_files(bundle_dir, core_files)

    print(bundle_dir)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
||||
@@ -12,6 +12,8 @@ from pathlib import Path
|
||||
REPO_ROOT = Path("{{REPO_ROOT}}")
|
||||
RUNTIME_ROOT = Path("{{RUNTIME_ROOT}}")
|
||||
HEALTHCHECK_PATH = Path("/usr/local/sbin/metin-login-healthcheck")
|
||||
INCIDENT_COLLECTOR_PATH = Path("/usr/local/sbin/metin-collect-incident")
|
||||
INCIDENT_ROOT = Path("/var/lib/metin/incidents")
|
||||
|
||||
sys.path.insert(0, str(REPO_ROOT))
|
||||
|
||||
@@ -42,6 +44,17 @@ def parse_args() -> argparse.Namespace:
|
||||
logs_parser.add_argument("-n", "--lines", type=int, default=100, help="Number of journal lines")
|
||||
logs_parser.add_argument("-f", "--follow", action="store_true", help="Follow the journal")
|
||||
|
||||
cores_parser = subparsers.add_parser("cores", help="List core files under the runtime tree")
|
||||
cores_parser.add_argument("--json", action="store_true", help="Print raw JSON")
|
||||
|
||||
incidents_parser = subparsers.add_parser("incidents", help="List collected incident bundles")
|
||||
incidents_parser.add_argument("--limit", type=int, default=10, help="Maximum number of bundles to show")
|
||||
|
||||
incident_collect = subparsers.add_parser("incident-collect", help="Collect an incident bundle")
|
||||
incident_collect.add_argument("--tag", default="manual", help="Short incident tag")
|
||||
incident_collect.add_argument("--since", default="-30 minutes", help="journalctl --since value")
|
||||
incident_collect.add_argument("--include-cores", action="store_true", help="Copy matching core files into the bundle")
|
||||
|
||||
subparsers.add_parser("healthcheck", help="Run the root-only headless healthcheck")
|
||||
return parser.parse_args()
|
||||
|
||||
@@ -121,6 +134,10 @@ def iter_port_rows() -> list[dict[str, str]]:
|
||||
return rows
|
||||
|
||||
|
||||
def iter_core_files() -> list[Path]:
    """Return regular files matching ``core*`` below ``RUNTIME_ROOT/channels``.

    Returns:
        Matching paths in sorted order; non-files are skipped.
    """
    found: list[Path] = []
    for candidate in sorted(RUNTIME_ROOT.glob("channels/**/core*")):
        if candidate.is_file():
            found.append(candidate)
    return found
|
||||
|
||||
|
||||
def live_ports() -> set[int]:
|
||||
if shutil.which("ss") is None:
|
||||
return set()
|
||||
@@ -236,6 +253,47 @@ def print_ports(show_live: bool) -> int:
|
||||
return 0
|
||||
|
||||
|
||||
def print_cores(as_json: bool) -> int:
    """List the core files found under the runtime tree.

    Args:
        as_json: Print the entries as indented JSON instead of a table.

    Returns:
        Always 0.
    """

    def describe(core: Path) -> dict[str, object]:
        info = core.stat()
        return {
            "path": str(core),
            "relative_path": str(core.relative_to(RUNTIME_ROOT)),
            "size_bytes": info.st_size,
            "mtime_epoch": int(info.st_mtime),
        }

    entries = [describe(core) for core in iter_core_files()]

    if as_json:
        print(json.dumps(entries, indent=2))
        return 0

    if not entries:
        print("No core files found under the runtime tree.")
        return 0

    # The table shows the runtime-relative path under the "path" heading.
    columns = ("relative_path", "size_bytes", "mtime_epoch")
    rows = [[str(entry[column]) for column in columns] for entry in entries]
    print_table(["path", "size_bytes", "mtime_epoch"], rows)
    return 0
|
||||
|
||||
|
||||
def print_incidents(limit: int) -> int:
    """Print a table of incident bundle directories.

    Directories directly under ``INCIDENT_ROOT`` are sorted in descending
    order and the list is truncated with the plain slice ``[:limit]``.

    Args:
        limit: Slice bound applied to the sorted directory list.

    Returns:
        Always 0.
    """
    if not INCIDENT_ROOT.exists():
        print(f"No incident directory: {INCIDENT_ROOT}")
        return 0

    directories = [entry for entry in INCIDENT_ROOT.iterdir() if entry.is_dir()]
    directories.sort(reverse=True)
    selected = directories[:limit]

    if selected:
        table_rows = [[item.name, str(item)] for item in selected]
        print_table(["bundle", "path"], table_rows)
    else:
        print(f"No incident bundles in {INCIDENT_ROOT}")
    return 0
|
||||
|
||||
|
||||
def run_unit_action(action: str, target: str) -> int:
|
||||
units = resolve_target_units(target)
|
||||
run(["systemctl", action, *units], require_root=True)
|
||||
@@ -260,6 +318,17 @@ def run_healthcheck() -> int:
|
||||
return 0
|
||||
|
||||
|
||||
def run_incident_collect(tag: str, since: str, include_cores: bool) -> int:
    """Invoke the installed incident collector with the given options.

    Args:
        tag: Incident tag forwarded as ``--tag``.
        since: ``journalctl --since`` value forwarded as ``--since``.
        include_cores: Forward ``--include-cores`` when true.

    Returns:
        0 after the collector command has been run.

    Raises:
        SystemExit: When the collector is not installed at
            ``INCIDENT_COLLECTOR_PATH``.
    """
    if not INCIDENT_COLLECTOR_PATH.exists():
        raise SystemExit(f"Missing incident collector: {INCIDENT_COLLECTOR_PATH}")

    # Use the "--option=value" form: the collector parses its arguments with
    # argparse, which treats a separate value such as "-1h" (leading dash, no
    # space) as an unknown option instead of the value of --since.
    command = [str(INCIDENT_COLLECTOR_PATH), f"--tag={tag}", f"--since={since}"]
    if include_cores:
        command.append("--include-cores")
    run(command, require_root=True)
    return 0
|
||||
|
||||
|
||||
def main() -> int:
|
||||
args = parse_args()
|
||||
|
||||
@@ -271,10 +340,16 @@ def main() -> int:
|
||||
return print_status(args.target)
|
||||
if args.command == "ports":
|
||||
return print_ports(args.live)
|
||||
if args.command == "cores":
|
||||
return print_cores(args.json)
|
||||
if args.command == "incidents":
|
||||
return print_incidents(args.limit)
|
||||
if args.command in {"start", "stop", "restart"}:
|
||||
return run_unit_action(args.command, args.target)
|
||||
if args.command == "logs":
|
||||
return run_logs(args.target, args.lines, args.follow)
|
||||
if args.command == "incident-collect":
|
||||
return run_incident_collect(args.tag, args.since, args.include_cores)
|
||||
if args.command == "healthcheck":
|
||||
return run_healthcheck()
|
||||
raise SystemExit(f"Unsupported command: {args.command}")
|
||||
|
||||
@@ -25,6 +25,7 @@ def parse_args() -> argparse.Namespace:
|
||||
parser.add_argument("--systemd-dir", default="/etc/systemd/system", help="systemd unit destination")
|
||||
parser.add_argument("--libexec-dir", default="/usr/local/libexec", help="Helper script destination")
|
||||
parser.add_argument("--bin-dir", default="/usr/local/bin", help="Binary/script destination")
|
||||
parser.add_argument("--sbin-dir", default="/usr/local/sbin", help="Root-only binary/script destination")
|
||||
parser.add_argument("--env-file", default="/etc/metin/metin.env", help="Optional EnvironmentFile path for runtime overrides")
|
||||
parser.add_argument("--wait-host", default="127.0.0.1", help="DB readiness host")
|
||||
parser.add_argument("--wait-port", type=int, default=9000, help="DB readiness port")
|
||||
@@ -100,6 +101,7 @@ def main() -> int:
|
||||
systemd_dir = Path(args.systemd_dir)
|
||||
libexec_dir = Path(args.libexec_dir)
|
||||
bin_dir = Path(args.bin_dir)
|
||||
sbin_dir = Path(args.sbin_dir)
|
||||
|
||||
selected_channels = resolve_channels(args)
|
||||
instances = resolve_instances(selected_channels)
|
||||
@@ -141,6 +143,11 @@ def main() -> int:
|
||||
render_template(BIN_DIR / "metinctl.in", template_values),
|
||||
0o755,
|
||||
)
|
||||
write_text(
|
||||
sbin_dir / "metin-collect-incident",
|
||||
render_template(BIN_DIR / "metin-collect-incident.in", template_values),
|
||||
0o700,
|
||||
)
|
||||
|
||||
verify_units = [str(systemd_dir / unit_name) for unit_name in unit_names]
|
||||
run(["systemd-analyze", "verify", *verify_units])
|
||||
|
||||
@@ -18,6 +18,7 @@ RestartSec=5
|
||||
KillSignal=SIGTERM
|
||||
TimeoutStopSec=60
|
||||
LimitNOFILE=65535
|
||||
LimitCORE=infinity
|
||||
|
||||
[Install]
|
||||
RequiredBy=metin-server.service
|
||||
|
||||
@@ -18,6 +18,7 @@ RestartSec=5
|
||||
KillSignal=SIGTERM
|
||||
TimeoutStopSec=180
|
||||
LimitNOFILE=65535
|
||||
LimitCORE=infinity
|
||||
|
||||
[Install]
|
||||
RequiredBy=metin-server.service
|
||||
|
||||
@@ -18,6 +18,7 @@ RestartSec=5
|
||||
KillSignal=SIGTERM
|
||||
TimeoutStopSec=60
|
||||
LimitNOFILE=65535
|
||||
LimitCORE=infinity
|
||||
|
||||
[Install]
|
||||
RequiredBy=metin-server.service
|
||||
|
||||
@@ -38,6 +38,8 @@ The Debian deployment installs:
|
||||
- listing declared ports
|
||||
- restarting the whole stack or specific channels/instances
|
||||
- viewing logs
|
||||
- listing core files in the runtime tree
|
||||
- collecting incident bundles
|
||||
- running the root-only headless healthcheck
|
||||
|
||||
## Examples
|
||||
@@ -84,6 +86,24 @@ Run the end-to-end healthcheck:
|
||||
metinctl healthcheck
|
||||
```
|
||||
|
||||
List core files currently present in the runtime tree:
|
||||
|
||||
```bash
|
||||
metinctl cores
|
||||
```
|
||||
|
||||
Collect an incident bundle with logs, unit status, port state and repository revisions:
|
||||
|
||||
```bash
|
||||
metinctl incident-collect --tag auth-timeout --since "-20 minutes"
|
||||
```
|
||||
|
||||
List the most recent incident bundles:
|
||||
|
||||
```bash
|
||||
metinctl incidents
|
||||
```
|
||||
|
||||
## systemd installer behavior
|
||||
|
||||
`deploy/systemd/install_systemd.py` now uses the same inventory and installs `metinctl`.
|
||||
@@ -95,3 +115,26 @@ It also reconciles enabled game instance units against the selected channels:
|
||||
- if `--restart` is passed, stale game units are disabled with `--now`
|
||||
|
||||
This makes channel enablement declarative instead of depending on whatever happened to be enabled previously.
|
||||
|
||||
## Crash / Incident Pipeline
|
||||
|
||||
The Debian deployment now also installs:
|
||||
|
||||
- `/usr/local/sbin/metin-collect-incident`
|
||||
|
||||
The collector creates a timestamped bundle under:
|
||||
|
||||
- `/var/lib/metin/incidents`
|
||||
|
||||
Each bundle contains:
|
||||
|
||||
- repo revisions for `m2dev-server` and `m2dev-server-src`
|
||||
- `systemctl status` for the whole stack
|
||||
- recent `journalctl` output per unit
|
||||
- listener state from `ss -ltnp`
|
||||
- tailed runtime `syslog.log` and `syserr.log` files
|
||||
- metadata for any `core*` files found under `runtime/server/channels`
|
||||
|
||||
If you call it with `--include-cores`, matching core files are copied into the bundle as well.
|
||||
|
||||
The runtime units now also declare `LimitCORE=infinity`, so after the next service restart the processes may write core dumps, provided the host's core-dump policy (for example `kernel.core_pattern`) allows it.
|
||||
|
||||
Reference in New Issue
Block a user