ops: add syserr triage views

This commit is contained in:
server
2026-04-14 16:18:02 +02:00
parent cd2e1d61ca
commit 84625652fe
2 changed files with 167 additions and 0 deletions

View File

@@ -21,6 +21,7 @@ INCIDENT_COLLECTOR_PATH = Path("/usr/local/sbin/metin-collect-incident")
# Directory checked for collected incident bundles.
INCIDENT_ROOT = Path("/var/lib/metin/incidents")
# Syslog of the auth channel, located under the runtime tree.
AUTH_SYSLOG_PATH = RUNTIME_ROOT / "channels" / "auth" / "syslog.log"
# Server source checkout, resolved two directory levels above the runtime root.
SOURCE_REPO_ROOT = RUNTIME_ROOT.parent.parent / "repos" / "m2dev-server-src"
# Glob (relative to RUNTIME_ROOT) matching syserr.log at any depth below channels/.
SYSERR_GLOB = "channels/**/syserr.log"
sys.path.insert(0, str(REPO_ROOT))
@@ -28,6 +29,7 @@ import channel_inventory
# "[YYYY-MM-DD HH:MM:SS.mmm] [tag] message" -- captures timestamp and message; the bracketed tag is discarded.
AUTH_LOG_LINE_RE = re.compile(r"^\[(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{3})\] \[[^]]+\] (?P<message>.*)$")
# Same line layout as above, but the bracketed tag is captured as "level".
GENERIC_LOG_LINE_RE = re.compile(r"^\[(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{3})\] \[(?P<level>[^]]+)\] (?P<message>.*)$")
# New-connection message: captures the bracketed peer address ("ip") and the hex value after "ptr" ("desc").
AUTH_CONN_RE = re.compile(r"SYSTEM: new connection from \[(?P<ip>[^]]+)\].* ptr (?P<desc>0x[0-9a-fA-F]+)")
# Login message: captures the text before "(<digits>)" as "login" and the hex value after "desc".
AUTH_LOGIN_RE = re.compile(r"InputAuth::Login : (?P<login>[^(]+)\(\d+\) desc (?P<desc>0x[0-9a-fA-F]+)")
# Rejected login string: captures the text inside IS_NOT_VALID_LOGIN_STRING(...) as "login" plus the hex "desc".
AUTH_INVALID_LOGIN_RE = re.compile(r"InputAuth::Login : IS_NOT_VALID_LOGIN_STRING\((?P<login>[^)]+)\) desc (?P<desc>0x[0-9a-fA-F]+)")
@@ -64,6 +66,16 @@ def parse_args() -> argparse.Namespace:
auth_ips.add_argument("--include-smoke", action="store_true", help="Include smoke-test logins")
auth_ips.add_argument("--json", action="store_true", help="Print raw JSON")
recent_errors = subparsers.add_parser("recent-errors", help="Show recent syserr entries across runtime components")
recent_errors.add_argument("--hours", type=int, default=24, help="How many hours back to inspect")
recent_errors.add_argument("--limit", type=int, default=30, help="Maximum errors to show")
recent_errors.add_argument("--json", action="store_true", help="Print raw JSON")
error_summary = subparsers.add_parser("error-summary", help="Summarize recurring syserr entries")
error_summary.add_argument("--hours", type=int, default=24, help="How many hours back to inspect")
error_summary.add_argument("--limit", type=int, default=20, help="Maximum grouped errors to show")
error_summary.add_argument("--json", action="store_true", help="Print raw JSON")
sessions = subparsers.add_parser("sessions", help="Show recent login sessions from loginlog2")
sessions.add_argument("--hours", type=int, default=24, help="How many hours back to inspect")
sessions.add_argument("--limit", type=int, default=20, help="Maximum sessions to show")
@@ -214,6 +226,10 @@ def iter_core_files() -> list[Path]:
return [path for path in sorted(RUNTIME_ROOT.glob("channels/**/core*")) if path.is_file()]
def iter_syserr_files() -> list[Path]:
    """Return all regular files under RUNTIME_ROOT matching SYSERR_GLOB, sorted by path."""
    found = [candidate for candidate in RUNTIME_ROOT.glob(SYSERR_GLOB) if candidate.is_file()]
    found.sort()
    return found
def count_incident_bundles() -> int:
if not INCIDENT_ROOT.exists():
return 0
@@ -360,6 +376,37 @@ def filter_auth_events(hours: int, include_smoke: bool, status: str) -> list[dic
return filtered
def load_syserr_entries(hours: int) -> list[dict[str, object]]:
    """Load error-level syserr entries from the last ``hours`` hours.

    Every file returned by ``iter_syserr_files()`` is scanned. A line matching
    ``GENERIC_LOG_LINE_RE`` starts a new entry; any following non-blank line
    that does not match is treated as a continuation and appended to the
    current entry's message, separated by " | ".

    Args:
        hours: Size of the look-back window, measured from the local clock.

    Returns:
        Entries with the keys ``time``, ``level``, ``source`` (path relative
        to ``RUNTIME_ROOT``) and ``message``, ordered oldest to newest across
        all files, so the last element is the most recent error.
    """
    cutoff = dt.datetime.now() - dt.timedelta(hours=hours)
    entries: list[dict[str, object]] = []
    for path in iter_syserr_files():
        source = str(path.relative_to(RUNTIME_ROOT))
        last_entry: dict[str, object] | None = None
        with path.open("r", encoding="utf-8", errors="replace") as handle:
            for raw_line in handle:
                # Strip NUL bytes before matching.
                line = raw_line.replace("\x00", "").rstrip("\n")
                match = GENERIC_LOG_LINE_RE.match(line)
                if match is None:
                    # Continuation line: only kept while an in-window error entry is open.
                    if last_entry is not None and line.strip():
                        last_entry["message"] = f"{last_entry['message']} | {line.strip()}"
                    continue
                timestamp = parse_auth_timestamp(match.group("timestamp"))
                level = match.group("level")
                if timestamp < cutoff or level.lower() != "error":
                    # Close the current entry so continuation lines belonging to a
                    # skipped record are not glued onto an earlier error.
                    last_entry = None
                    continue
                last_entry = {
                    "time": timestamp,
                    "level": level,
                    "source": source,
                    "message": match.group("message").strip(),
                }
                entries.append(last_entry)
    # Files are visited in path order, not time order. Callers take the tail of
    # this list as "the latest errors", so order chronologically; the sort is
    # stable, which keeps file order for identical timestamps.
    entries.sort(key=lambda entry: entry["time"])
    return entries
def run_mariadb_query(query: str) -> list[list[str]]:
completed = run(["mariadb", "-N", "-B", "-e", query], require_root=True, capture_output=True)
rows: list[list[str]] = []
@@ -560,6 +607,7 @@ def print_summary(hours: int, include_smoke: bool, as_json: bool) -> int:
listening = live_ports()
port_rows = iter_port_rows()
auth_summary = summarize_auth_activity(hours, include_smoke)
recent_errors = load_syserr_entries(hours)
stale_session_count = count_stale_sessions(hours=max(hours, 1), stale_minutes=30, include_orphans=False)
stale_total_count = count_stale_sessions(hours=max(hours, 1), stale_minutes=30, include_orphans=True)
stale_orphan_count = max(stale_total_count - stale_session_count, 0)
@@ -584,6 +632,12 @@ def print_summary(hours: int, include_smoke: bool, as_json: bool) -> int:
for row in port_rows
],
"auth": auth_summary,
"recent_error_count": len(recent_errors),
"latest_error": {
"time": recent_errors[-1]["time"].strftime("%Y-%m-%d %H:%M:%S"),
"source": recent_errors[-1]["source"],
"message": recent_errors[-1]["message"],
} if recent_errors else None,
"stale_open_sessions": {
"user_count": stale_session_count,
"orphan_count": stale_orphan_count,
@@ -623,6 +677,7 @@ def print_summary(hours: int, include_smoke: bool, as_json: bool) -> int:
print(f"game instances active: {game_active}/{len(enabled_game_units)} enabled ({len(game_units)} declared)")
print(f"core files: {core_count}")
print(f"incident bundles: {incident_count}")
print(f"recent syserr errors ({hours}h): {len(recent_errors)}")
print(f"stale open sessions (>30m): {stale_session_count} user, {stale_orphan_count} orphan")
print()
@@ -648,6 +703,12 @@ def print_summary(hours: int, include_smoke: bool, as_json: bool) -> int:
f"latest failure: {latest_failure['time'].strftime('%Y-%m-%d %H:%M:%S')} "
f"{latest_failure['login']} from {latest_failure['ip']} reason={latest_failure['reason']}"
)
latest_error = payload["latest_error"]
if latest_error:
print(
f"latest error: {latest_error['time']} "
f"{latest_error['source']} {latest_error['message']}"
)
return 0
@@ -771,6 +832,94 @@ def print_auth_ips(hours: int, limit: int, include_smoke: bool, as_json: bool) -
return 0
def print_recent_errors(hours: int, limit: int, as_json: bool) -> int:
    """Print the most recent syserr error entries as JSON or as a table.

    Args:
        hours: Look-back window passed to ``load_syserr_entries``.
        limit: Maximum number of entries to show; zero or a negative value
            shows none.
        as_json: When true, print the JSON payload instead of a table.

    Returns:
        Always 0 (process exit status).
    """
    # Order by time here so the tail slice really is "the newest entries",
    # whatever order the loader produced them in.
    loaded = sorted(load_syserr_entries(hours), key=lambda entry: entry["time"])
    # A plain [-limit:] would return *everything* for limit == 0 (it is [0:])
    # and drop the oldest entries for a negative limit.
    entries = loaded[-limit:] if limit > 0 else []
    formatted = [
        {
            "time": entry["time"].strftime("%Y-%m-%d %H:%M:%S"),
            "source": str(entry["source"]),
            "message": str(entry["message"]),
        }
        for entry in entries
    ]
    payload = {
        "window_hours": hours,
        "limit": limit,
        "count": len(formatted),
        "entries": formatted,
    }

    if as_json:
        print(json.dumps(payload, indent=2))
        return 0

    if not formatted:
        print(f"No syserr entries in the last {hours}h.")
        return 0

    rows = [[item["time"], item["source"], item["message"]] for item in formatted]
    print_table(["time", "source", "message"], rows)
    return 0
def print_error_summary(hours: int, limit: int, as_json: bool) -> int:
    """Print syserr errors grouped by (source, message) with occurrence counts.

    Groups are ranked by count, then by most recent occurrence, both
    descending, and truncated to ``limit`` rows. Output is JSON when
    ``as_json`` is set, otherwise a table. Always returns 0.
    """
    time_format = "%Y-%m-%d %H:%M:%S"

    buckets: dict[tuple[str, str], dict[str, object]] = {}
    for entry in load_syserr_entries(hours):
        source = str(entry["source"])
        message = str(entry["message"])
        seen_at = entry["time"]
        bucket = buckets.get((source, message))
        if bucket is None:
            bucket = {
                "source": source,
                "message": message,
                "count": 0,
                "last_seen": seen_at,
            }
            buckets[(source, message)] = bucket
        bucket["count"] = int(bucket["count"]) + 1
        if seen_at >= bucket["last_seen"]:
            bucket["last_seen"] = seen_at

    ranked = list(buckets.values())
    ranked.sort(key=lambda item: (int(item["count"]), item["last_seen"]), reverse=True)
    ranked = ranked[:limit]

    summary = []
    for group in ranked:
        summary.append(
            {
                "source": str(group["source"]),
                "count": int(group["count"]),
                "last_seen": group["last_seen"].strftime(time_format),
                "message": str(group["message"]),
            }
        )
    payload = {
        "window_hours": hours,
        "limit": limit,
        "count": len(ranked),
        "entries": summary,
    }

    if as_json:
        print(json.dumps(payload, indent=2))
        return 0

    if not ranked:
        print(f"No syserr summary entries in the last {hours}h.")
        return 0

    table_rows = [
        [str(item["count"]), item["last_seen"], item["source"], item["message"]]
        for item in summary
    ]
    print_table(["count", "last_seen", "source", "message"], table_rows)
    return 0
def resolve_target_units(target: str) -> list[str]:
normalized = target.strip().lower()
@@ -1058,6 +1207,10 @@ def main() -> int:
return print_auth_activity(args.hours, args.limit, args.status, args.include_smoke, args.json)
if args.command == "auth-ips":
return print_auth_ips(args.hours, args.limit, args.include_smoke, args.json)
if args.command == "recent-errors":
return print_recent_errors(args.hours, args.limit, args.json)
if args.command == "error-summary":
return print_error_summary(args.hours, args.limit, args.json)
if args.command == "status":
return print_status(args.target)
if args.command == "ports":

View File

@@ -35,6 +35,8 @@ The Debian deployment installs:
- showing an operational summary
- showing recent auth success/failure activity
- showing auth activity grouped by source IP
- showing recent `syserr.log` entries
- summarizing recurring `syserr.log` entries
- viewing inventory
- listing managed units
- checking service status
@@ -99,6 +101,18 @@ Show auth activity grouped by IP:
metinctl auth-ips
```
Show the latest runtime errors collected from all `syserr.log` files:
```bash
metinctl recent-errors
```
Show the most repeated runtime errors in the last 24 hours:
```bash
metinctl error-summary
```
Include smoke-test failures too:
```bash