diff --git a/deploy/systemd/bin/metinctl.in b/deploy/systemd/bin/metinctl.in index f7f1935..0628814 100644 --- a/deploy/systemd/bin/metinctl.in +++ b/deploy/systemd/bin/metinctl.in @@ -21,6 +21,7 @@ INCIDENT_COLLECTOR_PATH = Path("/usr/local/sbin/metin-collect-incident") INCIDENT_ROOT = Path("/var/lib/metin/incidents") AUTH_SYSLOG_PATH = RUNTIME_ROOT / "channels" / "auth" / "syslog.log" SOURCE_REPO_ROOT = RUNTIME_ROOT.parent.parent / "repos" / "m2dev-server-src" +SYSERR_GLOB = "channels/**/syserr.log" sys.path.insert(0, str(REPO_ROOT)) @@ -28,6 +29,7 @@ import channel_inventory AUTH_LOG_LINE_RE = re.compile(r"^\[(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{3})\] \[[^]]+\] (?P<message>.*)$") +GENERIC_LOG_LINE_RE = re.compile(r"^\[(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{3})\] \[(?P<level>[^]]+)\] (?P<message>.*)$") AUTH_CONN_RE = re.compile(r"SYSTEM: new connection from \[(?P<ip>[^]]+)\].* ptr (?P<ptr>0x[0-9a-fA-F]+)") AUTH_LOGIN_RE = re.compile(r"InputAuth::Login : (?P<login>[^(]+)\(\d+\) desc (?P<desc>0x[0-9a-fA-F]+)") AUTH_INVALID_LOGIN_RE = re.compile(r"InputAuth::Login : IS_NOT_VALID_LOGIN_STRING\((?P<login>[^)]+)\) desc (?P<desc>0x[0-9a-fA-F]+)") @@ -64,6 +66,16 @@ def parse_args() -> argparse.Namespace: auth_ips.add_argument("--include-smoke", action="store_true", help="Include smoke-test logins") auth_ips.add_argument("--json", action="store_true", help="Print raw JSON") + recent_errors = subparsers.add_parser("recent-errors", help="Show recent syserr entries across runtime components") + recent_errors.add_argument("--hours", type=int, default=24, help="How many hours back to inspect") + recent_errors.add_argument("--limit", type=int, default=30, help="Maximum errors to show") + recent_errors.add_argument("--json", action="store_true", help="Print raw JSON") + + error_summary = subparsers.add_parser("error-summary", help="Summarize recurring syserr entries") + error_summary.add_argument("--hours", type=int, default=24, help="How many hours back to inspect") + error_summary.add_argument("--limit", type=int, 
default=20, help="Maximum grouped errors to show") + error_summary.add_argument("--json", action="store_true", help="Print raw JSON") + sessions = subparsers.add_parser("sessions", help="Show recent login sessions from loginlog2") sessions.add_argument("--hours", type=int, default=24, help="How many hours back to inspect") sessions.add_argument("--limit", type=int, default=20, help="Maximum sessions to show") @@ -214,6 +226,10 @@ def iter_core_files() -> list[Path]: return [path for path in sorted(RUNTIME_ROOT.glob("channels/**/core*")) if path.is_file()] +def iter_syserr_files() -> list[Path]: + return sorted(path for path in RUNTIME_ROOT.glob(SYSERR_GLOB) if path.is_file()) + + def count_incident_bundles() -> int: if not INCIDENT_ROOT.exists(): return 0 @@ -360,6 +376,37 @@ def filter_auth_events(hours: int, include_smoke: bool, status: str) -> list[dic return filtered +def load_syserr_entries(hours: int) -> list[dict[str, object]]: + cutoff = dt.datetime.now() - dt.timedelta(hours=hours) + entries: list[dict[str, object]] = [] + + for path in iter_syserr_files(): + last_entry: dict[str, object] | None = None + with path.open("r", encoding="utf-8", errors="replace") as handle: + for raw_line in handle: + line = raw_line.replace("\x00", "").rstrip("\n") + match = GENERIC_LOG_LINE_RE.match(line) + if match: + timestamp = parse_auth_timestamp(match.group("timestamp")) + if timestamp < cutoff: + last_entry = None + continue + + last_entry = { + "time": timestamp, + "level": match.group("level"), + "source": str(path.relative_to(RUNTIME_ROOT)), + "message": match.group("message").strip(), + } + entries.append(last_entry) + continue + + if last_entry is not None and line.strip(): + last_entry["message"] = f"{last_entry['message']} | {line.strip()}" + + return [entry for entry in entries if str(entry["level"]).lower() == "error"] + + def run_mariadb_query(query: str) -> list[list[str]]: completed = run(["mariadb", "-N", "-B", "-e", query], require_root=True, 
capture_output=True) rows: list[list[str]] = [] @@ -560,6 +607,7 @@ def print_summary(hours: int, include_smoke: bool, as_json: bool) -> int: listening = live_ports() port_rows = iter_port_rows() auth_summary = summarize_auth_activity(hours, include_smoke) + recent_errors = load_syserr_entries(hours) stale_session_count = count_stale_sessions(hours=max(hours, 1), stale_minutes=30, include_orphans=False) stale_total_count = count_stale_sessions(hours=max(hours, 1), stale_minutes=30, include_orphans=True) stale_orphan_count = max(stale_total_count - stale_session_count, 0) @@ -584,6 +632,12 @@ def print_summary(hours: int, include_smoke: bool, as_json: bool) -> int: for row in port_rows ], "auth": auth_summary, + "recent_error_count": len(recent_errors), + "latest_error": { + "time": recent_errors[-1]["time"].strftime("%Y-%m-%d %H:%M:%S"), + "source": recent_errors[-1]["source"], + "message": recent_errors[-1]["message"], + } if recent_errors else None, "stale_open_sessions": { "user_count": stale_session_count, "orphan_count": stale_orphan_count, @@ -623,6 +677,7 @@ def print_summary(hours: int, include_smoke: bool, as_json: bool) -> int: print(f"game instances active: {game_active}/{len(enabled_game_units)} enabled ({len(game_units)} declared)") print(f"core files: {core_count}") print(f"incident bundles: {incident_count}") + print(f"recent syserr errors ({hours}h): {len(recent_errors)}") print(f"stale open sessions (>30m): {stale_session_count} user, {stale_orphan_count} orphan") print() @@ -648,6 +703,12 @@ def print_summary(hours: int, include_smoke: bool, as_json: bool) -> int: f"latest failure: {latest_failure['time'].strftime('%Y-%m-%d %H:%M:%S')} " f"{latest_failure['login']} from {latest_failure['ip']} reason={latest_failure['reason']}" ) + latest_error = payload["latest_error"] + if latest_error: + print( + f"latest error: {latest_error['time']} " + f"{latest_error['source']} {latest_error['message']}" + ) return 0 @@ -771,6 +832,94 @@ def 
print_auth_ips(hours: int, limit: int, include_smoke: bool, as_json: bool) - return 0 +def print_recent_errors(hours: int, limit: int, as_json: bool) -> int: + entries = load_syserr_entries(hours)[-limit:] + payload = { + "window_hours": hours, + "limit": limit, + "count": len(entries), + "entries": [ + { + "time": entry["time"].strftime("%Y-%m-%d %H:%M:%S"), + "source": str(entry["source"]), + "message": str(entry["message"]), + } + for entry in entries + ], + } + + if as_json: + print(json.dumps(payload, indent=2)) + return 0 + + if not entries: + print(f"No syserr entries in the last {hours}h.") + return 0 + + rows = [ + [entry["time"].strftime("%Y-%m-%d %H:%M:%S"), str(entry["source"]), str(entry["message"])] + for entry in entries + ] + print_table(["time", "source", "message"], rows) + return 0 + + +def print_error_summary(hours: int, limit: int, as_json: bool) -> int: + entries = load_syserr_entries(hours) + grouped: dict[tuple[str, str], dict[str, object]] = {} + + for entry in entries: + key = (str(entry["source"]), str(entry["message"])) + bucket = grouped.setdefault( + key, + { + "source": str(entry["source"]), + "message": str(entry["message"]), + "count": 0, + "last_seen": entry["time"], + }, + ) + bucket["count"] = int(bucket["count"]) + 1 + if entry["time"] >= bucket["last_seen"]: + bucket["last_seen"] = entry["time"] + + rows = sorted( + grouped.values(), + key=lambda item: (int(item["count"]), item["last_seen"]), + reverse=True, + )[:limit] + + payload = { + "window_hours": hours, + "limit": limit, + "count": len(rows), + "entries": [ + { + "source": str(row["source"]), + "count": int(row["count"]), + "last_seen": row["last_seen"].strftime("%Y-%m-%d %H:%M:%S"), + "message": str(row["message"]), + } + for row in rows + ], + } + + if as_json: + print(json.dumps(payload, indent=2)) + return 0 + + if not rows: + print(f"No syserr summary entries in the last {hours}h.") + return 0 + + table_rows = [ + [str(row["count"]), 
row["last_seen"].strftime("%Y-%m-%d %H:%M:%S"), str(row["source"]), str(row["message"])] + for row in rows + ] + print_table(["count", "last_seen", "source", "message"], table_rows) + return 0 + + def resolve_target_units(target: str) -> list[str]: normalized = target.strip().lower() @@ -1058,6 +1207,10 @@ def main() -> int: return print_auth_activity(args.hours, args.limit, args.status, args.include_smoke, args.json) if args.command == "auth-ips": return print_auth_ips(args.hours, args.limit, args.include_smoke, args.json) + if args.command == "recent-errors": + return print_recent_errors(args.hours, args.limit, args.json) + if args.command == "error-summary": + return print_error_summary(args.hours, args.limit, args.json) if args.command == "status": return print_status(args.target) if args.command == "ports": diff --git a/docs/server-management.md b/docs/server-management.md index 9d19fea..b23cb83 100644 --- a/docs/server-management.md +++ b/docs/server-management.md @@ -35,6 +35,8 @@ The Debian deployment installs: - showing an operational summary - showing recent auth success/failure activity - showing auth activity grouped by source IP +- showing recent `syserr.log` entries +- summarizing recurring `syserr.log` entries - viewing inventory - listing managed units - checking service status @@ -99,6 +101,18 @@ Show auth activity grouped by IP: metinctl auth-ips ``` +Show the latest runtime errors collected from all `syserr.log` files: + +```bash +metinctl recent-errors +``` + +Show the most repeated runtime errors in the last 24 hours: + +```bash +metinctl error-summary +``` + Include smoke-test failures too: ```bash