ops: add syserr triage views
This commit is contained in:
@@ -21,6 +21,7 @@ INCIDENT_COLLECTOR_PATH = Path("/usr/local/sbin/metin-collect-incident")
|
||||
INCIDENT_ROOT = Path("/var/lib/metin/incidents")
|
||||
AUTH_SYSLOG_PATH = RUNTIME_ROOT / "channels" / "auth" / "syslog.log"
|
||||
SOURCE_REPO_ROOT = RUNTIME_ROOT.parent.parent / "repos" / "m2dev-server-src"
|
||||
SYSERR_GLOB = "channels/**/syserr.log"
|
||||
|
||||
sys.path.insert(0, str(REPO_ROOT))
|
||||
|
||||
@@ -28,6 +29,7 @@ import channel_inventory
|
||||
|
||||
|
||||
AUTH_LOG_LINE_RE = re.compile(r"^\[(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{3})\] \[[^]]+\] (?P<message>.*)$")
|
||||
GENERIC_LOG_LINE_RE = re.compile(r"^\[(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{3})\] \[(?P<level>[^]]+)\] (?P<message>.*)$")
|
||||
AUTH_CONN_RE = re.compile(r"SYSTEM: new connection from \[(?P<ip>[^]]+)\].* ptr (?P<desc>0x[0-9a-fA-F]+)")
|
||||
AUTH_LOGIN_RE = re.compile(r"InputAuth::Login : (?P<login>[^(]+)\(\d+\) desc (?P<desc>0x[0-9a-fA-F]+)")
|
||||
AUTH_INVALID_LOGIN_RE = re.compile(r"InputAuth::Login : IS_NOT_VALID_LOGIN_STRING\((?P<login>[^)]+)\) desc (?P<desc>0x[0-9a-fA-F]+)")
|
||||
@@ -64,6 +66,16 @@ def parse_args() -> argparse.Namespace:
|
||||
auth_ips.add_argument("--include-smoke", action="store_true", help="Include smoke-test logins")
|
||||
auth_ips.add_argument("--json", action="store_true", help="Print raw JSON")
|
||||
|
||||
recent_errors = subparsers.add_parser("recent-errors", help="Show recent syserr entries across runtime components")
|
||||
recent_errors.add_argument("--hours", type=int, default=24, help="How many hours back to inspect")
|
||||
recent_errors.add_argument("--limit", type=int, default=30, help="Maximum errors to show")
|
||||
recent_errors.add_argument("--json", action="store_true", help="Print raw JSON")
|
||||
|
||||
error_summary = subparsers.add_parser("error-summary", help="Summarize recurring syserr entries")
|
||||
error_summary.add_argument("--hours", type=int, default=24, help="How many hours back to inspect")
|
||||
error_summary.add_argument("--limit", type=int, default=20, help="Maximum grouped errors to show")
|
||||
error_summary.add_argument("--json", action="store_true", help="Print raw JSON")
|
||||
|
||||
sessions = subparsers.add_parser("sessions", help="Show recent login sessions from loginlog2")
|
||||
sessions.add_argument("--hours", type=int, default=24, help="How many hours back to inspect")
|
||||
sessions.add_argument("--limit", type=int, default=20, help="Maximum sessions to show")
|
||||
@@ -214,6 +226,10 @@ def iter_core_files() -> list[Path]:
|
||||
return [path for path in sorted(RUNTIME_ROOT.glob("channels/**/core*")) if path.is_file()]
|
||||
|
||||
|
||||
def iter_syserr_files() -> list[Path]:
    """Return every syserr log file under the runtime channel tree, in path order."""
    candidates = RUNTIME_ROOT.glob(SYSERR_GLOB)
    files = [candidate for candidate in candidates if candidate.is_file()]
    files.sort()
    return files
|
||||
|
||||
|
||||
def count_incident_bundles() -> int:
|
||||
if not INCIDENT_ROOT.exists():
|
||||
return 0
|
||||
@@ -360,6 +376,37 @@ def filter_auth_events(hours: int, include_smoke: bool, status: str) -> list[dic
|
||||
return filtered
|
||||
|
||||
|
||||
def load_syserr_entries(hours: int) -> list[dict[str, object]]:
    """Collect syserr error entries newer than *hours* hours across all channels.

    Each entry is a dict with keys "time" (datetime), "level", "source"
    (path relative to RUNTIME_ROOT) and "message".  Lines that do not match
    GENERIC_LOG_LINE_RE are treated as continuations and folded into the
    preceding entry's message, joined with " | ".  Only entries whose level
    is "error" (case-insensitive) are returned.

    The result is sorted chronologically (oldest first), so ``result[-1]``
    is the most recent error across every file — callers depend on that
    (the summary view's "latest_error" and recent-errors ``[-limit:]``).
    """
    cutoff = dt.datetime.now() - dt.timedelta(hours=hours)
    entries: list[dict[str, object]] = []

    for path in iter_syserr_files():
        last_entry: dict[str, object] | None = None
        with path.open("r", encoding="utf-8", errors="replace") as handle:
            for raw_line in handle:
                # Drop NUL bytes occasionally left in logs by crashed writers.
                line = raw_line.replace("\x00", "").rstrip("\n")
                match = GENERIC_LOG_LINE_RE.match(line)
                if match:
                    timestamp = parse_auth_timestamp(match.group("timestamp"))
                    if timestamp < cutoff:
                        # Entry too old: also discard its continuation lines.
                        last_entry = None
                        continue

                    last_entry = {
                        "time": timestamp,
                        "level": match.group("level"),
                        "source": str(path.relative_to(RUNTIME_ROOT)),
                        "message": match.group("message").strip(),
                    }
                    entries.append(last_entry)
                    continue

                if last_entry is not None and line.strip():
                    last_entry["message"] = f"{last_entry['message']} | {line.strip()}"

    errors = [entry for entry in entries if str(entry["level"]).lower() == "error"]
    # Fix: entries used to come back grouped per file (path order), so
    # "latest" lookups like recent_errors[-1] returned the last entry of the
    # alphabetically-last file instead of the newest error overall.
    errors.sort(key=lambda entry: entry["time"])
    return errors
|
||||
|
||||
|
||||
def run_mariadb_query(query: str) -> list[list[str]]:
|
||||
completed = run(["mariadb", "-N", "-B", "-e", query], require_root=True, capture_output=True)
|
||||
rows: list[list[str]] = []
|
||||
@@ -560,6 +607,7 @@ def print_summary(hours: int, include_smoke: bool, as_json: bool) -> int:
|
||||
listening = live_ports()
|
||||
port_rows = iter_port_rows()
|
||||
auth_summary = summarize_auth_activity(hours, include_smoke)
|
||||
recent_errors = load_syserr_entries(hours)
|
||||
stale_session_count = count_stale_sessions(hours=max(hours, 1), stale_minutes=30, include_orphans=False)
|
||||
stale_total_count = count_stale_sessions(hours=max(hours, 1), stale_minutes=30, include_orphans=True)
|
||||
stale_orphan_count = max(stale_total_count - stale_session_count, 0)
|
||||
@@ -584,6 +632,12 @@ def print_summary(hours: int, include_smoke: bool, as_json: bool) -> int:
|
||||
for row in port_rows
|
||||
],
|
||||
"auth": auth_summary,
|
||||
"recent_error_count": len(recent_errors),
|
||||
"latest_error": {
|
||||
"time": recent_errors[-1]["time"].strftime("%Y-%m-%d %H:%M:%S"),
|
||||
"source": recent_errors[-1]["source"],
|
||||
"message": recent_errors[-1]["message"],
|
||||
} if recent_errors else None,
|
||||
"stale_open_sessions": {
|
||||
"user_count": stale_session_count,
|
||||
"orphan_count": stale_orphan_count,
|
||||
@@ -623,6 +677,7 @@ def print_summary(hours: int, include_smoke: bool, as_json: bool) -> int:
|
||||
print(f"game instances active: {game_active}/{len(enabled_game_units)} enabled ({len(game_units)} declared)")
|
||||
print(f"core files: {core_count}")
|
||||
print(f"incident bundles: {incident_count}")
|
||||
print(f"recent syserr errors ({hours}h): {len(recent_errors)}")
|
||||
print(f"stale open sessions (>30m): {stale_session_count} user, {stale_orphan_count} orphan")
|
||||
print()
|
||||
|
||||
@@ -648,6 +703,12 @@ def print_summary(hours: int, include_smoke: bool, as_json: bool) -> int:
|
||||
f"latest failure: {latest_failure['time'].strftime('%Y-%m-%d %H:%M:%S')} "
|
||||
f"{latest_failure['login']} from {latest_failure['ip']} reason={latest_failure['reason']}"
|
||||
)
|
||||
latest_error = payload["latest_error"]
|
||||
if latest_error:
|
||||
print(
|
||||
f"latest error: {latest_error['time']} "
|
||||
f"{latest_error['source']} {latest_error['message']}"
|
||||
)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -771,6 +832,94 @@ def print_auth_ips(hours: int, limit: int, include_smoke: bool, as_json: bool) -
|
||||
return 0
|
||||
|
||||
|
||||
def print_recent_errors(hours: int, limit: int, as_json: bool) -> int:
    """Show the newest syserr entries within the window, as JSON or a table.

    Always returns 0; an empty window just prints an informational line.
    """
    entries = load_syserr_entries(hours)[-limit:]
    serialized = [
        {
            "time": entry["time"].strftime("%Y-%m-%d %H:%M:%S"),
            "source": str(entry["source"]),
            "message": str(entry["message"]),
        }
        for entry in entries
    ]
    payload = {
        "window_hours": hours,
        "limit": limit,
        "count": len(entries),
        "entries": serialized,
    }

    if as_json:
        print(json.dumps(payload, indent=2))
        return 0

    if not entries:
        print(f"No syserr entries in the last {hours}h.")
        return 0

    table_rows = [[item["time"], item["source"], item["message"]] for item in serialized]
    print_table(["time", "source", "message"], table_rows)
    return 0
|
||||
|
||||
|
||||
def print_error_summary(hours: int, limit: int, as_json: bool) -> int:
    """Group recurring syserr errors by (source, message) and show top offenders.

    Buckets carry a running count and the most recent timestamp; output is
    ordered by count (then recency), truncated to *limit*.  Always returns 0.
    """
    grouped: dict[tuple[str, str], dict[str, object]] = {}
    for entry in load_syserr_entries(hours):
        source = str(entry["source"])
        message = str(entry["message"])
        bucket = grouped.get((source, message))
        if bucket is None:
            bucket = {
                "source": source,
                "message": message,
                "count": 0,
                "last_seen": entry["time"],
            }
            grouped[(source, message)] = bucket
        bucket["count"] = int(bucket["count"]) + 1
        if entry["time"] >= bucket["last_seen"]:
            bucket["last_seen"] = entry["time"]

    ranked = sorted(
        grouped.values(),
        key=lambda bucket: (int(bucket["count"]), bucket["last_seen"]),
        reverse=True,
    )
    rows = ranked[:limit]

    payload = {
        "window_hours": hours,
        "limit": limit,
        "count": len(rows),
        "entries": [
            {
                "source": str(row["source"]),
                "count": int(row["count"]),
                "last_seen": row["last_seen"].strftime("%Y-%m-%d %H:%M:%S"),
                "message": str(row["message"]),
            }
            for row in rows
        ],
    }

    if as_json:
        print(json.dumps(payload, indent=2))
        return 0

    if not rows:
        print(f"No syserr summary entries in the last {hours}h.")
        return 0

    table_rows = []
    for row in rows:
        table_rows.append(
            [
                str(row["count"]),
                row["last_seen"].strftime("%Y-%m-%d %H:%M:%S"),
                str(row["source"]),
                str(row["message"]),
            ]
        )
    print_table(["count", "last_seen", "source", "message"], table_rows)
    return 0
|
||||
|
||||
|
||||
def resolve_target_units(target: str) -> list[str]:
|
||||
normalized = target.strip().lower()
|
||||
|
||||
@@ -1058,6 +1207,10 @@ def main() -> int:
|
||||
return print_auth_activity(args.hours, args.limit, args.status, args.include_smoke, args.json)
|
||||
if args.command == "auth-ips":
|
||||
return print_auth_ips(args.hours, args.limit, args.include_smoke, args.json)
|
||||
if args.command == "recent-errors":
|
||||
return print_recent_errors(args.hours, args.limit, args.json)
|
||||
if args.command == "error-summary":
|
||||
return print_error_summary(args.hours, args.limit, args.json)
|
||||
if args.command == "status":
|
||||
return print_status(args.target)
|
||||
if args.command == "ports":
|
||||
|
||||
Reference in New Issue
Block a user