ops: add summary and auth failure views

This commit is contained in:
server
2026-04-14 16:00:14 +02:00
parent 4fccf13e09
commit 825cfbc19b
2 changed files with 349 additions and 0 deletions

View File

@@ -2,8 +2,12 @@
from __future__ import annotations from __future__ import annotations
import argparse import argparse
import collections
import datetime as dt
import json import json
import os import os
import pwd
import re
import shutil import shutil
import subprocess import subprocess
import sys import sys
@@ -15,12 +19,24 @@ RUNTIME_ROOT = Path("{{RUNTIME_ROOT}}")
HEALTHCHECK_PATH = Path("/usr/local/sbin/metin-login-healthcheck") HEALTHCHECK_PATH = Path("/usr/local/sbin/metin-login-healthcheck")
INCIDENT_COLLECTOR_PATH = Path("/usr/local/sbin/metin-collect-incident") INCIDENT_COLLECTOR_PATH = Path("/usr/local/sbin/metin-collect-incident")
INCIDENT_ROOT = Path("/var/lib/metin/incidents") INCIDENT_ROOT = Path("/var/lib/metin/incidents")
# Auth channel syslog file; load_auth_activity() reads login activity from it.
AUTH_SYSLOG_PATH = RUNTIME_ROOT / "channels" / "auth" / "syslog.log"
# Source checkout resolved relative to the runtime tree: two directory levels
# above RUNTIME_ROOT, then repos/m2dev-server-src.
SOURCE_REPO_ROOT = RUNTIME_ROOT.parent.parent / "repos" / "m2dev-server-src"
sys.path.insert(0, str(REPO_ROOT)) sys.path.insert(0, str(REPO_ROOT))
import channel_inventory import channel_inventory
# One auth syslog line: "[YYYY-MM-DD HH:MM:SS.mmm] [<tag>] <message>".
AUTH_LOG_LINE_RE = re.compile(r"^\[(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{3})\] \[[^]]+\] (?P<message>.*)$")
# New-connection message: captures the bracketed ip and the hex "ptr" value,
# which load_auth_activity() uses as the key for later per-connection lookups.
AUTH_CONN_RE = re.compile(r"SYSTEM: new connection from \[(?P<ip>[^]]+)\].* ptr (?P<desc>0x[0-9a-fA-F]+)")
# Login attempt: login name, a parenthesised number, then the hex desc value.
AUTH_LOGIN_RE = re.compile(r"InputAuth::Login : (?P<login>[^(]+)\(\d+\) desc (?P<desc>0x[0-9a-fA-F]+)")
# Login attempt flagged with IS_NOT_VALID_LOGIN_STRING; the login is inside the parentheses.
AUTH_INVALID_LOGIN_RE = re.compile(r"InputAuth::Login : IS_NOT_VALID_LOGIN_STRING\((?P<login>[^)]+)\) desc (?P<desc>0x[0-9a-fA-F]+)")
# START marker for either AUTH_LOGIN_DIRECT or QID_AUTH_LOGIN, carrying a number and the hex desc value.
AUTH_START_RE = re.compile(r"(?:AUTH_LOGIN_DIRECT|QID_AUTH_LOGIN): START \d+ (?P<desc>0x[0-9a-fA-F]+)")
# Direct-login success message; everything after "SUCCESS " is captured as the login.
AUTH_SUCCESS_DIRECT_RE = re.compile(r"AUTH_LOGIN_DIRECT: SUCCESS (?P<login>.+)$")
# A message consisting solely of one of these codes is recorded as the failure reason.
AUTH_FAILURE_RE = re.compile(r"^(NOID|WRONGPWD|NOTAVAIL|ALREADY)$")
# Login prefixes treated as smoke-test accounts by is_smoke_login(), which
# lowercases the login before comparing. Note: "smk" already matches every
# login that the longer "smk*" entries would match.
SMOKE_LOGIN_PREFIXES = ("smk", "smkhc", "smkdel", "smkfull", "smkneg", "smoke_", "csmk")
def parse_args() -> argparse.Namespace: def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Operational CLI for the Debian Metin runtime") parser = argparse.ArgumentParser(description="Operational CLI for the Debian Metin runtime")
subparsers = parser.add_subparsers(dest="command", required=True) subparsers = parser.add_subparsers(dest="command", required=True)
@@ -30,6 +46,11 @@ def parse_args() -> argparse.Namespace:
subparsers.add_parser("units", help="List managed systemd units") subparsers.add_parser("units", help="List managed systemd units")
summary_parser = subparsers.add_parser("summary", help="Show an operational summary")
summary_parser.add_argument("--hours", type=int, default=24, help="Auth activity window in hours")
summary_parser.add_argument("--include-smoke", action="store_true", help="Include smoke-test logins in auth summary")
summary_parser.add_argument("--json", action="store_true", help="Print raw JSON")
status_parser = subparsers.add_parser("status", help="Show current unit state") status_parser = subparsers.add_parser("status", help="Show current unit state")
status_parser.add_argument("target", nargs="?", default="all", help="stack, db, auth, game, channel:<id>, instance:<name>") status_parser.add_argument("target", nargs="?", default="all", help="stack, db, auth, game, channel:<id>, instance:<name>")
@@ -56,6 +77,12 @@ def parse_args() -> argparse.Namespace:
incident_collect.add_argument("--since", default="-30 minutes", help="journalctl --since value") incident_collect.add_argument("--since", default="-30 minutes", help="journalctl --since value")
incident_collect.add_argument("--include-cores", action="store_true", help="Copy matching core files into the bundle") incident_collect.add_argument("--include-cores", action="store_true", help="Copy matching core files into the bundle")
auth_failures = subparsers.add_parser("auth-failures", help="Show recent auth failures from auth syslog")
auth_failures.add_argument("--hours", type=int, default=24, help="How many hours back to inspect")
auth_failures.add_argument("--limit", type=int, default=20, help="Maximum failures to show")
auth_failures.add_argument("--include-smoke", action="store_true", help="Include smoke-test logins")
auth_failures.add_argument("--json", action="store_true", help="Print raw JSON")
wait_ready = subparsers.add_parser("wait-ready", help="Wait until the runtime passes the login-ready probe") wait_ready = subparsers.add_parser("wait-ready", help="Wait until the runtime passes the login-ready probe")
wait_ready.add_argument("--timeout", type=int, default=120, help="Maximum seconds to wait") wait_ready.add_argument("--timeout", type=int, default=120, help="Maximum seconds to wait")
wait_ready.add_argument("--interval", type=float, default=5.0, help="Seconds between healthcheck attempts") wait_ready.add_argument("--interval", type=float, default=5.0, help="Seconds between healthcheck attempts")
@@ -80,6 +107,22 @@ def run(command: list[str], require_root: bool = False, capture_output: bool = F
) )
def run_as_repo_owner(command: list[str], repo_path: Path, capture_output: bool = False, check: bool = True) -> subprocess.CompletedProcess[str]:
    """Run *command*, switching from root to the owner of *repo_path* via sudo.

    The command is wrapped in ``sudo -u <owner>`` only when the current
    effective uid is 0, *repo_path* exists, and it is owned by a non-root
    user. In every other case the command runs unchanged.

    Args:
        command: Argument vector to execute (never passed through a shell).
        repo_path: Path whose owner decides which user runs the command.
        capture_output: Forwarded to ``subprocess.run``.
        check: Forwarded to ``subprocess.run``; when true a non-zero exit
            raises ``subprocess.CalledProcessError``.

    Returns:
        The ``CompletedProcess`` of the (possibly sudo-wrapped) command, with
        text-mode stdout/stderr.
    """
    effective_command = list(command)
    if repo_path.exists():
        owner_uid = repo_path.stat().st_uid
        if os.geteuid() == 0 and owner_uid != 0:
            try:
                owner = pwd.getpwuid(owner_uid).pw_name
            except KeyError:
                # The uid has no passwd entry (e.g. the account was removed or
                # the tree was copied from another host). Fall back to sudo's
                # numeric "#<uid>" form instead of crashing with KeyError.
                # NOTE(review): sudo may still refuse an unknown uid depending
                # on its configuration; that then surfaces as a non-zero exit
                # status from the command rather than an unhandled exception.
                owner = f"#{owner_uid}"
            effective_command = ["sudo", "-u", owner, *effective_command]
    return subprocess.run(
        effective_command,
        check=check,
        capture_output=capture_output,
        text=True,
    )
def get_unit_state(unit: str) -> tuple[str, str, str]: def get_unit_state(unit: str) -> tuple[str, str, str]:
active = run(["systemctl", "is-active", unit], capture_output=True, check=False).stdout.strip() or "unknown" active = run(["systemctl", "is-active", unit], capture_output=True, check=False).stdout.strip() or "unknown"
enabled = run(["systemctl", "is-enabled", unit], capture_output=True, check=False).stdout.strip() or "unknown" enabled = run(["systemctl", "is-enabled", unit], capture_output=True, check=False).stdout.strip() or "unknown"
@@ -144,6 +187,144 @@ def iter_core_files() -> list[Path]:
return [path for path in sorted(RUNTIME_ROOT.glob("channels/**/core*")) if path.is_file()] return [path for path in sorted(RUNTIME_ROOT.glob("channels/**/core*")) if path.is_file()]
def count_incident_bundles() -> int:
    """Return the number of directories directly under INCIDENT_ROOT.

    A missing INCIDENT_ROOT counts as zero; non-directory entries are ignored.
    """
    if INCIDENT_ROOT.exists():
        return len([entry for entry in INCIDENT_ROOT.iterdir() if entry.is_dir()])
    return 0
def git_summary(repo_path: Path) -> dict[str, object]:
    """Describe the git checkout at *repo_path*.

    Returns a dict that always has ``path`` and ``present``. When the path
    exists it also has ``head`` (short commit id), ``branch``, ``dirty`` and
    ``status_count`` (number of non-blank ``git status --short`` lines).
    ``head`` and ``branch`` fall back to ``"unknown"`` when git prints nothing.
    """
    present = repo_path.exists()
    info: dict[str, object] = {"path": str(repo_path), "present": present}
    if not present:
        return info

    def git_stdout(*git_args: str) -> str:
        # check=False: a failing git call yields empty output, not an exception.
        completed = run_as_repo_owner(
            ["git", "-C", str(repo_path), *git_args],
            repo_path,
            capture_output=True,
            check=False,
        )
        return completed.stdout

    head = git_stdout("rev-parse", "--short", "HEAD").strip()
    branch = git_stdout("rev-parse", "--abbrev-ref", "HEAD").strip()
    changed_lines = [line for line in git_stdout("status", "--short").splitlines() if line.strip()]

    info["head"] = head or "unknown"
    info["branch"] = branch or "unknown"
    info["dirty"] = bool(changed_lines)
    info["status_count"] = len(changed_lines)
    return info
def is_smoke_login(login: str) -> bool:
    """Return True when *login* starts with a smoke-test prefix (case-insensitive)."""
    return login.lower().startswith(SMOKE_LOGIN_PREFIXES)
def parse_auth_timestamp(value: str) -> dt.datetime:
    """Parse a ``YYYY-MM-DD HH:MM:SS.fff`` log timestamp into a naive datetime.

    Raises:
        ValueError: If *value* does not match that layout.
    """
    layout = "%Y-%m-%d %H:%M:%S.%f"
    return dt.datetime.strptime(value, layout)
def load_auth_activity(hours: int) -> list[dict[str, object]]:
    """Reconstruct login outcomes from the auth syslog for the last *hours* hours.

    Lines older than the cutoff (local ``datetime.now()`` minus *hours*) and
    lines that do not match AUTH_LOG_LINE_RE are ignored. The remaining
    messages are correlated through the hex ``desc`` value they carry:

    * a connection message records the ip for its desc;
    * an invalid-login message is emitted immediately as a failure with
      reason ``INVALID_LOGIN_STRING``;
    * a login message (without `` key ``) is parked until its outcome shows up;
    * a START message selects which desc the next outcome message refers to;
    * a failure code or success message resolves the parked login for the
      selected desc and emits it.

    Returns:
        Events in log order. Each has ``time``, ``login``, ``ip`` (``"-"``
        when no connection was seen for the desc), ``status``, ``reason`` and
        ``smoke``. An empty list is returned when the log file is missing.
    """
    if not AUTH_SYSLOG_PATH.exists():
        return []

    oldest_allowed = dt.datetime.now() - dt.timedelta(hours=hours)
    ip_by_desc: dict[str, str] = {}
    awaiting_result: dict[str, dict[str, object]] = {}
    current_desc: str | None = None
    activity: list[dict[str, object]] = []

    with AUTH_SYSLOG_PATH.open("r", encoding="utf-8", errors="replace") as log_file:
        for raw_line in log_file:
            # NUL bytes are dropped before matching so they cannot break the line pattern.
            parsed = AUTH_LOG_LINE_RE.match(raw_line.replace("\x00", "").rstrip("\n"))
            if parsed is None:
                continue
            stamp = parse_auth_timestamp(parsed.group("timestamp"))
            if stamp < oldest_allowed:
                continue
            message = parsed.group("message").strip()

            connection = AUTH_CONN_RE.search(message)
            if connection is not None:
                ip_by_desc[connection.group("desc")] = connection.group("ip")
                continue

            rejected = AUTH_INVALID_LOGIN_RE.match(message)
            if rejected is not None:
                bad_login = rejected.group("login")
                activity.append(
                    {
                        "time": stamp,
                        "login": bad_login,
                        "ip": ip_by_desc.get(rejected.group("desc"), "-"),
                        "status": "failure",
                        "reason": "INVALID_LOGIN_STRING",
                        "smoke": is_smoke_login(bad_login),
                    }
                )
                continue

            attempt = AUTH_LOGIN_RE.match(message)
            if attempt is not None and " key " not in message:
                attempted_login = attempt.group("login").strip()
                attempt_desc = attempt.group("desc")
                awaiting_result[attempt_desc] = {
                    "time": stamp,
                    "login": attempted_login,
                    "ip": ip_by_desc.get(attempt_desc, "-"),
                    "smoke": is_smoke_login(attempted_login),
                }
                continue

            started = AUTH_START_RE.match(message)
            if started is not None:
                current_desc = started.group("desc")
                continue

            # Classify the message as an outcome; anything else is irrelevant.
            failed = AUTH_FAILURE_RE.match(message)
            if failed is not None:
                outcome = ("failure", failed.group(1))
            elif AUTH_SUCCESS_DIRECT_RE.match(message) is not None or message.startswith("QID_AUTH_LOGIN: SUCCESS"):
                outcome = ("success", "SUCCESS")
            else:
                continue

            # An outcome only counts when a START selected a desc that still
            # has a parked login; otherwise it is dropped and state is untouched.
            if current_desc is None or current_desc not in awaiting_result:
                continue
            resolved = awaiting_result.pop(current_desc)
            resolved["status"], resolved["reason"] = outcome
            activity.append(resolved)
            current_desc = None

    return activity
def summarize_auth_activity(hours: int, include_smoke: bool) -> dict[str, object]:
    """Aggregate auth events from the last *hours* hours into counters.

    Smoke-test events are dropped unless *include_smoke* is true. The result
    holds the success/failure counts, a per-reason failure tally, and the last
    success and last failure event in log order (``None`` when absent).
    """
    successes: list[dict[str, object]] = []
    failures: list[dict[str, object]] = []
    for event in load_auth_activity(hours):
        if event["smoke"] and not include_smoke:
            continue
        if event["status"] == "success":
            successes.append(event)
        elif event["status"] == "failure":
            failures.append(event)

    reason_tally = collections.Counter(str(event["reason"]) for event in failures)
    return {
        "window_hours": hours,
        "include_smoke": include_smoke,
        "success_count": len(successes),
        "failure_count": len(failures),
        "failure_reasons": dict(reason_tally),
        "latest_success": successes[-1] if successes else None,
        "latest_failure": failures[-1] if failures else None,
    }
def live_ports() -> set[int]: def live_ports() -> set[int]:
if shutil.which("ss") is None: if shutil.which("ss") is None:
return set() return set()
@@ -201,6 +382,107 @@ def print_units() -> int:
return 0 return 0
def print_summary(hours: int, include_smoke: bool, as_json: bool) -> int:
    """Print an operational overview: repos, units, ports, auth activity.

    Args:
        hours: Auth activity window passed to summarize_auth_activity().
        include_smoke: Whether smoke-test logins count in the auth figures.
        as_json: Emit the collected payload as JSON instead of tables.

    Returns:
        Always 0.
    """
    core_units = [
        channel_inventory.STACK_UNIT,
        channel_inventory.DB_UNIT,
        channel_inventory.DB_READY_UNIT,
        channel_inventory.AUTH_UNIT,
    ]
    unit_rows = []
    for unit_name in [*core_units, *channel_inventory.get_game_units()]:
        active, sub_state, enabled = get_unit_state(unit_name)
        unit_rows.append({"unit": unit_name, "active": active, "sub": sub_state, "enabled": enabled})

    game_units = [row for row in unit_rows if row["unit"].startswith("metin-game@")]
    enabled_game_units = [row for row in game_units if row["enabled"] == "enabled"]
    game_active = len([row for row in enabled_game_units if row["active"] == "active"])

    listening = live_ports()
    port_rows = iter_port_rows()
    auth_summary = summarize_auth_activity(hours, include_smoke)
    repos = {
        "m2dev-server": git_summary(REPO_ROOT),
        "m2dev-server-src": git_summary(SOURCE_REPO_ROOT),
    }
    incident_count = count_incident_bundles()
    core_count = len(iter_core_files())

    ports_with_state = [{**row, "live": int(row["port"]) in listening} for row in port_rows]
    payload = {
        "repos": repos,
        "units": unit_rows,
        "game_active": game_active,
        "game_enabled": len(enabled_game_units),
        "game_declared": len(game_units),
        "ports": ports_with_state,
        "auth": auth_summary,
        "core_count": core_count,
        "incident_count": incident_count,
    }

    if as_json:
        # default=str renders the datetime objects inside the auth events.
        print(json.dumps(payload, indent=2, default=str))
        return 0

    repo_rows = [
        [
            name,
            str(summary.get("branch", "unknown")),
            str(summary.get("head", "unknown")),
            "yes" if summary.get("dirty") else "no",
        ]
        for name, summary in repos.items()
    ]
    public_port_rows = [
        [row["name"], row["port"], row["p2p_port"], "yes" if row["live"] else "no"]
        for row in ports_with_state
        if row["visibility"] == "public"
    ]
    # Only the fixed stack/db/db-ready/auth units are tabulated; game
    # instances are condensed into the single counter line below.
    runtime_rows = [
        [row["unit"], row["active"], row["sub"], row["enabled"]]
        for row in unit_rows[: len(core_units)]
    ]

    print("Repos")
    print_table(["repo", "branch", "head", "dirty"], repo_rows)
    print()
    print("Runtime")
    print_table(["unit", "active", "sub", "enabled"], runtime_rows)
    print(f"game instances active: {game_active}/{len(enabled_game_units)} enabled ({len(game_units)} declared)")
    print(f"core files: {core_count}")
    print(f"incident bundles: {incident_count}")
    print()
    print("Public Ports")
    print_table(["name", "port", "p2p", "live"], public_port_rows)
    print()
    print(f"Auth ({hours}h)")
    print(f"successes: {auth_summary['success_count']}")
    print(f"failures: {auth_summary['failure_count']}")

    failure_reasons = auth_summary["failure_reasons"]
    if failure_reasons:
        reason_line = ", ".join(f"{reason}={count}" for reason, count in sorted(failure_reasons.items()))
        print(f"failure reasons: {reason_line}")

    latest_success = auth_summary["latest_success"]
    if latest_success:
        success_time = latest_success["time"].strftime("%Y-%m-%d %H:%M:%S")
        print(f"latest success: {success_time} {latest_success['login']} from {latest_success['ip']}")

    latest_failure = auth_summary["latest_failure"]
    if latest_failure:
        failure_time = latest_failure["time"].strftime("%Y-%m-%d %H:%M:%S")
        print(
            f"latest failure: {failure_time} {latest_failure['login']} "
            f"from {latest_failure['ip']} reason={latest_failure['reason']}"
        )
    return 0
def resolve_target_units(target: str) -> list[str]: def resolve_target_units(target: str) -> list[str]:
normalized = target.strip().lower() normalized = target.strip().lower()
@@ -317,6 +599,49 @@ def run_logs(target: str, lines: int, follow: bool) -> int:
return 0 return 0
def print_auth_failures(hours: int, limit: int, include_smoke: bool, as_json: bool) -> int:
    """Print the most recent auth failures found in the auth syslog.

    Args:
        hours: How many hours back to inspect.
        limit: Maximum number of failures to show (newest kept); must be >= 1.
        include_smoke: Whether failures of smoke-test logins are listed.
        as_json: Emit a JSON document instead of a table.

    Returns:
        Always 0.

    Raises:
        SystemExit: If *limit* is not a positive integer.
    """
    # A bare failures[-limit:] silently misbehaves for non-positive limits:
    # -0 == 0 returns the whole list, and a negative limit drops the oldest
    # entries instead of capping the newest. Reject such values up front.
    if limit < 1:
        raise SystemExit("--limit must be a positive integer")

    events = load_auth_activity(hours)
    failures = [
        event
        for event in events
        if event["status"] == "failure" and (include_smoke or not event["smoke"])
    ]
    failures = failures[-limit:]
    # Counts describe the entries actually shown, i.e. after applying the limit.
    reason_counts = collections.Counter(str(event["reason"]) for event in failures)

    payload = {
        "window_hours": hours,
        "limit": limit,
        "include_smoke": include_smoke,
        "count": len(failures),
        "reasons": dict(reason_counts),
        "entries": [
            {
                "time": event["time"].strftime("%Y-%m-%d %H:%M:%S"),
                "login": event["login"],
                "ip": event["ip"],
                "reason": event["reason"],
            }
            for event in failures
        ],
    }

    if as_json:
        print(json.dumps(payload, indent=2))
        return 0

    if not failures:
        print(f"No auth failures in the last {hours}h.")
        return 0

    if reason_counts:
        print(", ".join(f"{reason}={count}" for reason, count in sorted(reason_counts.items())))
        print()

    rows = [
        [event["time"].strftime("%Y-%m-%d %H:%M:%S"), str(event["login"]), str(event["ip"]), str(event["reason"])]
        for event in failures
    ]
    print_table(["time", "login", "ip", "reason"], rows)
    return 0
def run_healthcheck(mode: str) -> int: def run_healthcheck(mode: str) -> int:
if not HEALTHCHECK_PATH.exists(): if not HEALTHCHECK_PATH.exists():
raise SystemExit(f"Missing healthcheck wrapper: {HEALTHCHECK_PATH}") raise SystemExit(f"Missing healthcheck wrapper: {HEALTHCHECK_PATH}")
@@ -369,6 +694,8 @@ def main() -> int:
return print_inventory(args.json) return print_inventory(args.json)
if args.command == "units": if args.command == "units":
return print_units() return print_units()
if args.command == "summary":
return print_summary(args.hours, args.include_smoke, args.json)
if args.command == "status": if args.command == "status":
return print_status(args.target) return print_status(args.target)
if args.command == "ports": if args.command == "ports":
@@ -377,6 +704,8 @@ def main() -> int:
return print_cores(args.json) return print_cores(args.json)
if args.command == "incidents": if args.command == "incidents":
return print_incidents(args.limit) return print_incidents(args.limit)
if args.command == "auth-failures":
return print_auth_failures(args.hours, args.limit, args.include_smoke, args.json)
if args.command in {"start", "stop", "restart"}: if args.command in {"start", "stop", "restart"}:
return run_unit_action(args.command, args.target) return run_unit_action(args.command, args.target)
if args.command == "logs": if args.command == "logs":

View File

@@ -32,10 +32,12 @@ The Debian deployment installs:
`metinctl` is a lightweight operational CLI for: `metinctl` is a lightweight operational CLI for:
- showing an operational summary
- viewing inventory - viewing inventory
- listing managed units - listing managed units
- checking service status - checking service status
- listing declared ports - listing declared ports
- listing recent auth failures
- restarting the whole stack or specific channels/instances - restarting the whole stack or specific channels/instances
- viewing logs - viewing logs
- listing core files in the runtime tree - listing core files in the runtime tree
@@ -57,12 +59,30 @@ Show current unit state:
metinctl status metinctl status
``` ```
Show a quick operational summary:
```bash
metinctl summary
```
Show declared ports and whether they are currently listening: Show declared ports and whether they are currently listening:
```bash ```bash
metinctl ports --live metinctl ports --live
``` ```
Show recent auth failures, excluding smoke-test logins (the default):
```bash
metinctl auth-failures
```
Include smoke-test failures too:
```bash
metinctl auth-failures --include-smoke
```
Restart only channel 1 cores: Restart only channel 1 cores:
```bash ```bash