ops: add incident collection pipeline

This commit is contained in:
server
2026-04-14 13:21:02 +02:00
parent 78518daed0
commit 1a9a643335
8 changed files with 322 additions and 0 deletions

View File

@@ -34,11 +34,14 @@ The channel selection and port layout now come from the versioned inventory file
- `/usr/local/libexec/metin-game-instance-start`
- `/usr/local/libexec/metin-wait-port`
- `/usr/local/bin/metinctl`
- `/usr/local/sbin/metin-collect-incident`
The `metin-db-ready.service` gate waits until the DB socket is actually accepting connections before `auth` and `game` units start.
The installer also reconciles enabled `metin-game@...` instances against the selected channel set so stale units do not stay enabled forever.
The runtime unit templates now also set `LimitCORE=infinity` for `db`, `auth`, and `game` services.
## Optional Environment File
The runtime units support an optional `EnvironmentFile` for host-local overrides:

View File

@@ -0,0 +1,191 @@
#!/usr/bin/env python3
from __future__ import annotations
import argparse
import json
import os
import shutil
import socket
import subprocess
import sys
from datetime import datetime, timezone
from pathlib import Path
REPO_ROOT = Path("{{REPO_ROOT}}")
RUNTIME_ROOT = Path("{{RUNTIME_ROOT}}")
INCIDENT_ROOT_DEFAULT = Path("/var/lib/metin/incidents")
sys.path.insert(0, str(REPO_ROOT))
import channel_inventory
def parse_args() -> argparse.Namespace:
    """Parse the collector's command-line options.

    Returns:
        Namespace carrying ``tag``, ``since``, ``output_root`` and
        ``include_cores``.
    """
    cli = argparse.ArgumentParser(description="Collect a Metin runtime incident bundle")
    # The value-taking options all share one shape, so declare them as data.
    valued_options = (
        ("--tag", "manual", "Short incident tag used in the bundle directory name"),
        ("--since", "-30 minutes", "journalctl --since value"),
        ("--output-root", str(INCIDENT_ROOT_DEFAULT), "Incident bundle root directory"),
    )
    for flag, default, help_text in valued_options:
        cli.add_argument(flag, default=default, help=help_text)
    cli.add_argument(
        "--include-cores",
        action="store_true",
        help="Copy matching core files into the bundle",
    )
    return cli.parse_args()
def ensure_root() -> None:
    """Abort with ``SystemExit`` unless the effective user id is 0."""
    running_as_root = os.geteuid() == 0
    if running_as_root:
        return
    raise SystemExit("Run as root.")
def sanitize_tag(value: str) -> str:
    """Turn a free-form tag into a string safe for a directory name.

    Surrounding whitespace is stripped; every remaining character that is
    neither alphanumeric nor ``-``/``_`` is replaced by ``-``. An empty
    result falls back to ``"manual"``.
    """
    allowed_punctuation = "-_"
    pieces = []
    for symbol in value.strip():
        keep = symbol.isalnum() or symbol in allowed_punctuation
        pieces.append(symbol if keep else "-")
    cleaned = "".join(pieces)
    return cleaned if cleaned else "manual"
def run(command: list[str], check: bool = True) -> subprocess.CompletedProcess[str]:
    """Run *command* and capture its stdout/stderr as text.

    Args:
        command: Argument vector; executed without a shell.
        check: When true, a non-zero exit status raises
            ``subprocess.CalledProcessError``.

    Returns:
        The completed process with decoded ``stdout`` and ``stderr``.

    Raises:
        subprocess.CalledProcessError: Non-zero exit while ``check`` is true.
        OSError: The executable could not be started.
    """
    # errors="replace": captured logs may contain bytes that are not valid in
    # the active text encoding; a single bad byte must not abort the caller
    # with UnicodeDecodeError.
    return subprocess.run(
        command,
        check=check,
        capture_output=True,
        text=True,
        errors="replace",
    )
def write_text(path: Path, content: str) -> None:
    """Write *content* to *path* as UTF-8, creating parent directories first."""
    parent = path.parent
    parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as handle:
        handle.write(content)
def write_command_output(bundle_dir: Path, filename: str, command: list[str], check: bool = False) -> None:
    """Run *command* and store its output in ``bundle_dir / filename``.

    The file starts with a ``$ <command>`` header, followed by stdout and,
    if present, a ``[stderr]`` section.

    Args:
        bundle_dir: Directory that receives the output file.
        filename: Name of the file to write inside *bundle_dir*.
        command: Argument vector to execute.
        check: When true, a failing or unstartable command raises; when
            false (best effort), the failure is recorded in the file.

    Raises:
        subprocess.CalledProcessError: Non-zero exit while ``check`` is true.
        OSError: The command could not be started while ``check`` is true.
    """
    sections = [f"$ {' '.join(command)}\n\n"]
    try:
        completed = run(command, check=check)
    except OSError as exc:
        if check:
            raise
        # Best-effort mode: a missing or unstartable tool must not abort the
        # whole collection, so note the failure in place of the output.
        sections.append(f"[error]\n{exc}\n")
    else:
        if completed.stdout:
            sections.append(completed.stdout)
        if completed.stderr:
            sections.append("\n[stderr]\n" + completed.stderr)
    write_text(bundle_dir / filename, "".join(sections))
def copy_log_tails(bundle_dir: Path) -> None:
    """Store the last 400 lines of each channel log under ``bundle_dir/logs``.

    All ``syslog.log`` files found below ``RUNTIME_ROOT/channels`` are
    handled first, then all ``syserr.log`` files, each group in sorted
    order. The destination mirrors the path relative to ``RUNTIME_ROOT``.
    """
    logs_dir = bundle_dir / "logs"

    candidates: list[Path] = []
    for pattern in ("channels/**/syslog.log", "channels/**/syserr.log"):
        candidates.extend(sorted(RUNTIME_ROOT.glob(pattern)))

    for source in candidates:
        if not source.is_file():
            continue
        target = logs_dir / source.relative_to(RUNTIME_ROOT)
        target.parent.mkdir(parents=True, exist_ok=True)

        tail = run(["tail", "-n", "400", str(source)], check=False)
        parts = [f"# tail -n 400 {source}\n\n"]
        if tail.stdout:
            parts.append(tail.stdout)
        if tail.stderr:
            parts.append("\n[stderr]\n" + tail.stderr)
        target.write_text("".join(parts), encoding="utf-8")
def find_core_files() -> list[Path]:
    """Return the sorted regular files matching ``core*`` below ``RUNTIME_ROOT/channels``."""
    candidates = sorted(RUNTIME_ROOT.glob("channels/**/core*"))
    return [candidate for candidate in candidates if candidate.is_file()]
def write_core_metadata(bundle_dir: Path, core_files: list[Path]) -> None:
    """Write ``core-files.json`` describing every path in *core_files*.

    Each entry records the path, its size in bytes and its modification
    time as an ISO-8601 UTC timestamp. An empty list yields ``[]``.
    """

    def describe(core: Path) -> dict[str, object]:
        info = core.stat()
        modified = datetime.fromtimestamp(info.st_mtime, tz=timezone.utc)
        return {
            "path": str(core),
            "size_bytes": info.st_size,
            "mtime": modified.isoformat(),
        }

    inventory = [describe(core) for core in core_files]
    write_text(bundle_dir / "core-files.json", json.dumps(inventory, indent=2))
def copy_core_files(bundle_dir: Path, core_files: list[Path]) -> None:
    """Copy each core file into ``bundle_dir/cores``.

    The destination mirrors the path relative to ``RUNTIME_ROOT``;
    ``shutil.copy2`` is used so file metadata is copied along with the data.
    """
    cores_dir = bundle_dir / "cores"
    for source in core_files:
        target = cores_dir.joinpath(source.relative_to(RUNTIME_ROOT))
        target.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(source, target)
def git_summary(repo_path: Path) -> dict[str, object]:
    """Describe the git state of *repo_path* for the incident metadata.

    Args:
        repo_path: Directory expected to hold a git checkout.

    Returns:
        A dict with ``path`` and ``present``. When the path exists it also
        carries ``head``, ``branch``, ``dirty`` and ``status``. ``dirty`` is
        ``None`` when ``git status`` itself failed, so an unreadable
        repository is not reported as clean. Any failed git query is listed
        under ``errors`` (the key is omitted when everything succeeded).
    """
    summary: dict[str, object] = {"path": str(repo_path), "present": repo_path.exists()}
    if not summary["present"]:
        return summary

    errors: list[str] = []

    def query(*git_args: str) -> str | None:
        # Return stdout of one git query, or None (with the reason recorded)
        # when git cannot be started or exits non-zero.
        label = "git " + " ".join(git_args)
        try:
            completed = run(["git", "-C", str(repo_path), *git_args], check=False)
        except OSError as exc:
            errors.append(f"{label}: {exc}")
            return None
        if completed.returncode != 0:
            errors.append(f"{label}: {completed.stderr.strip()}")
            return None
        return completed.stdout

    head = query("rev-parse", "HEAD")
    status = query("status", "--short")
    branch = query("rev-parse", "--abbrev-ref", "HEAD")

    summary.update(
        {
            "head": (head or "").strip(),
            "branch": (branch or "").strip(),
            "dirty": None if status is None else bool(status.strip()),
            "status": [] if status is None else status.splitlines(),
        }
    )
    if errors:
        summary["errors"] = errors
    return summary
def main() -> int:
    """Collect one incident bundle and print its directory.

    Steps: parse options, require root, create a private bundle directory
    named ``<UTC timestamp>-<tag>`` under the output root, then write
    ``meta.json``, host command snapshots, per-unit journals, channel log
    tails and core-file metadata. Core files themselves are copied only with
    ``--include-cores``.

    Returns:
        ``0`` once the bundle has been written.

    Raises:
        SystemExit: When not running as root.
        FileExistsError: When the bundle directory already exists.
    """
    args = parse_args()
    ensure_root()

    # A single clock reading feeds both the directory name and meta.json so
    # the two timestamps always agree.
    now = datetime.now(timezone.utc)
    timestamp = now.strftime("%Y%m%dT%H%M%SZ")
    tag = sanitize_tag(args.tag)
    output_root = Path(args.output_root)
    bundle_dir = output_root / f"{timestamp}-{tag}"
    # Create the directory private from the start instead of relying solely
    # on a later chmod; the chmod stays to pin the exact mode regardless of
    # the umask. exist_ok=False avoids mixing into an existing bundle.
    bundle_dir.mkdir(mode=0o700, parents=True, exist_ok=False)
    os.chmod(bundle_dir, 0o700)

    units = [
        channel_inventory.STACK_UNIT,
        channel_inventory.DB_UNIT,
        channel_inventory.DB_READY_UNIT,
        channel_inventory.AUTH_UNIT,
        *channel_inventory.get_game_units(),
    ]

    source_repo = RUNTIME_ROOT.parent.parent / "repos" / "m2dev-server-src"
    runtime_repo = REPO_ROOT

    meta = {
        "created_at": now.isoformat(),
        "hostname": socket.gethostname(),
        "runtime_root": str(RUNTIME_ROOT),
        "output_root": str(output_root),
        "tag": tag,
        "since": args.since,
        "repos": {
            "m2dev-server": git_summary(runtime_repo),
            "m2dev-server-src": git_summary(source_repo),
        },
    }
    write_text(bundle_dir / "meta.json", json.dumps(meta, indent=2))

    # Host-level snapshots.
    write_command_output(bundle_dir, "uname.txt", ["uname", "-a"])
    write_command_output(bundle_dir, "df.txt", ["df", "-h"])
    write_command_output(bundle_dir, "free.txt", ["free", "-h"], check=False)
    write_command_output(bundle_dir, "ports.txt", ["ss", "-ltnp"], check=False)
    write_command_output(bundle_dir, "systemctl-status.txt", ["systemctl", "status", "--no-pager", *units], check=False)

    # One journal excerpt per unit; "@" and "." are replaced to form the
    # file name.
    journal_dir = bundle_dir / "journal"
    for unit in units:
        safe_name = unit.replace("@", "_").replace(".", "_")
        write_command_output(
            journal_dir,
            f"{safe_name}.log",
            ["journalctl", "--no-pager", "--since", args.since, "-u", unit],
            check=False,
        )

    copy_log_tails(bundle_dir)

    core_files = find_core_files()
    write_core_metadata(bundle_dir, core_files)
    if args.include_cores and core_files:
        copy_core_files(bundle_dir, core_files)

    print(bundle_dir)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())

View File

@@ -12,6 +12,8 @@ from pathlib import Path
REPO_ROOT = Path("{{REPO_ROOT}}")
RUNTIME_ROOT = Path("{{RUNTIME_ROOT}}")
HEALTHCHECK_PATH = Path("/usr/local/sbin/metin-login-healthcheck")
INCIDENT_COLLECTOR_PATH = Path("/usr/local/sbin/metin-collect-incident")
INCIDENT_ROOT = Path("/var/lib/metin/incidents")
sys.path.insert(0, str(REPO_ROOT))
@@ -42,6 +44,17 @@ def parse_args() -> argparse.Namespace:
logs_parser.add_argument("-n", "--lines", type=int, default=100, help="Number of journal lines")
logs_parser.add_argument("-f", "--follow", action="store_true", help="Follow the journal")
cores_parser = subparsers.add_parser("cores", help="List core files under the runtime tree")
cores_parser.add_argument("--json", action="store_true", help="Print raw JSON")
incidents_parser = subparsers.add_parser("incidents", help="List collected incident bundles")
incidents_parser.add_argument("--limit", type=int, default=10, help="Maximum number of bundles to show")
incident_collect = subparsers.add_parser("incident-collect", help="Collect an incident bundle")
incident_collect.add_argument("--tag", default="manual", help="Short incident tag")
incident_collect.add_argument("--since", default="-30 minutes", help="journalctl --since value")
incident_collect.add_argument("--include-cores", action="store_true", help="Copy matching core files into the bundle")
subparsers.add_parser("healthcheck", help="Run the root-only headless healthcheck")
return parser.parse_args()
@@ -121,6 +134,10 @@ def iter_port_rows() -> list[dict[str, str]]:
return rows
def iter_core_files() -> list[Path]:
    """Return the sorted regular files matching ``core*`` below ``RUNTIME_ROOT/channels``."""
    found = []
    for candidate in sorted(RUNTIME_ROOT.glob("channels/**/core*")):
        if candidate.is_file():
            found.append(candidate)
    return found
def live_ports() -> set[int]:
if shutil.which("ss") is None:
return set()
@@ -236,6 +253,47 @@ def print_ports(show_live: bool) -> int:
return 0
def print_cores(as_json: bool) -> int:
    """List core files found under the runtime tree.

    Args:
        as_json: Print the entries as indented JSON instead of a table.

    Returns:
        Always ``0``.
    """

    def describe(core):
        info = core.stat()
        return {
            "path": str(core),
            "relative_path": str(core.relative_to(RUNTIME_ROOT)),
            "size_bytes": info.st_size,
            "mtime_epoch": int(info.st_mtime),
        }

    entries = [describe(core) for core in iter_core_files()]

    if as_json:
        # JSON mode prints the list even when it is empty.
        print(json.dumps(entries, indent=2))
    elif not entries:
        print("No core files found under the runtime tree.")
    else:
        # The table shows the path relative to the runtime root.
        table_rows = []
        for entry in entries:
            table_rows.append(
                [entry["relative_path"], str(entry["size_bytes"]), str(entry["mtime_epoch"])]
            )
        print_table(["path", "size_bytes", "mtime_epoch"], table_rows)
    return 0
def print_incidents(limit: int) -> int:
    """Print the most recent incident bundles, newest name first.

    Args:
        limit: Maximum number of bundles to show; must be at least 1.

    Returns:
        Always ``0``.

    Raises:
        SystemExit: When *limit* is not positive.
    """
    # Slicing with 0 would report "no bundles" even when some exist, and a
    # negative value would silently drop entries from the end of the list.
    if limit < 1:
        raise SystemExit("--limit must be a positive integer")

    # is_dir() rather than exists(): a stray non-directory at this path
    # would otherwise make iterdir() raise.
    if not INCIDENT_ROOT.is_dir():
        print(f"No incident directory: {INCIDENT_ROOT}")
        return 0

    # Reverse-sorted paths; all entries share one parent, so this orders by
    # bundle name.
    bundles = sorted((path for path in INCIDENT_ROOT.iterdir() if path.is_dir()), reverse=True)[:limit]
    if not bundles:
        print(f"No incident bundles in {INCIDENT_ROOT}")
        return 0

    rows = [[bundle.name, str(bundle)] for bundle in bundles]
    print_table(["bundle", "path"], rows)
    return 0
def run_unit_action(action: str, target: str) -> int:
units = resolve_target_units(target)
run(["systemctl", action, *units], require_root=True)
@@ -260,6 +318,17 @@ def run_healthcheck() -> int:
return 0
def run_incident_collect(tag: str, since: str, include_cores: bool) -> int:
    """Invoke the installed incident collector with the given options.

    Args:
        tag: Short incident tag forwarded as ``--tag``.
        since: ``journalctl --since`` value forwarded as ``--since``.
        include_cores: Forward ``--include-cores`` when true.

    Returns:
        ``0`` after the collector has been run.

    Raises:
        SystemExit: When the collector script is not installed.
    """
    if not INCIDENT_COLLECTOR_PATH.exists():
        raise SystemExit(f"Missing incident collector: {INCIDENT_COLLECTOR_PATH}")

    # Use the --opt=value form: values that start with "-" (such as "-1h" or
    # the default "-30 minutes") could otherwise be parsed by the collector
    # as options instead of option values.
    command = [str(INCIDENT_COLLECTOR_PATH), f"--tag={tag}", f"--since={since}"]
    if include_cores:
        command.append("--include-cores")
    run(command, require_root=True)
    return 0
def main() -> int:
args = parse_args()
@@ -271,10 +340,16 @@ def main() -> int:
return print_status(args.target)
if args.command == "ports":
return print_ports(args.live)
if args.command == "cores":
return print_cores(args.json)
if args.command == "incidents":
return print_incidents(args.limit)
if args.command in {"start", "stop", "restart"}:
return run_unit_action(args.command, args.target)
if args.command == "logs":
return run_logs(args.target, args.lines, args.follow)
if args.command == "incident-collect":
return run_incident_collect(args.tag, args.since, args.include_cores)
if args.command == "healthcheck":
return run_healthcheck()
raise SystemExit(f"Unsupported command: {args.command}")

View File

@@ -25,6 +25,7 @@ def parse_args() -> argparse.Namespace:
parser.add_argument("--systemd-dir", default="/etc/systemd/system", help="systemd unit destination")
parser.add_argument("--libexec-dir", default="/usr/local/libexec", help="Helper script destination")
parser.add_argument("--bin-dir", default="/usr/local/bin", help="Binary/script destination")
parser.add_argument("--sbin-dir", default="/usr/local/sbin", help="Root-only binary/script destination")
parser.add_argument("--env-file", default="/etc/metin/metin.env", help="Optional EnvironmentFile path for runtime overrides")
parser.add_argument("--wait-host", default="127.0.0.1", help="DB readiness host")
parser.add_argument("--wait-port", type=int, default=9000, help="DB readiness port")
@@ -100,6 +101,7 @@ def main() -> int:
systemd_dir = Path(args.systemd_dir)
libexec_dir = Path(args.libexec_dir)
bin_dir = Path(args.bin_dir)
sbin_dir = Path(args.sbin_dir)
selected_channels = resolve_channels(args)
instances = resolve_instances(selected_channels)
@@ -141,6 +143,11 @@ def main() -> int:
render_template(BIN_DIR / "metinctl.in", template_values),
0o755,
)
write_text(
sbin_dir / "metin-collect-incident",
render_template(BIN_DIR / "metin-collect-incident.in", template_values),
0o700,
)
verify_units = [str(systemd_dir / unit_name) for unit_name in unit_names]
run(["systemd-analyze", "verify", *verify_units])

View File

@@ -18,6 +18,7 @@ RestartSec=5
KillSignal=SIGTERM
TimeoutStopSec=60
LimitNOFILE=65535
LimitCORE=infinity
[Install]
RequiredBy=metin-server.service

View File

@@ -18,6 +18,7 @@ RestartSec=5
KillSignal=SIGTERM
TimeoutStopSec=180
LimitNOFILE=65535
LimitCORE=infinity
[Install]
RequiredBy=metin-server.service

View File

@@ -18,6 +18,7 @@ RestartSec=5
KillSignal=SIGTERM
TimeoutStopSec=60
LimitNOFILE=65535
LimitCORE=infinity
[Install]
RequiredBy=metin-server.service