ops: add syserr triage views
This commit is contained in:
@@ -21,6 +21,7 @@ INCIDENT_COLLECTOR_PATH = Path("/usr/local/sbin/metin-collect-incident")
|
|||||||
INCIDENT_ROOT = Path("/var/lib/metin/incidents")
|
INCIDENT_ROOT = Path("/var/lib/metin/incidents")
|
||||||
AUTH_SYSLOG_PATH = RUNTIME_ROOT / "channels" / "auth" / "syslog.log"
|
AUTH_SYSLOG_PATH = RUNTIME_ROOT / "channels" / "auth" / "syslog.log"
|
||||||
SOURCE_REPO_ROOT = RUNTIME_ROOT.parent.parent / "repos" / "m2dev-server-src"
|
SOURCE_REPO_ROOT = RUNTIME_ROOT.parent.parent / "repos" / "m2dev-server-src"
|
||||||
|
# Recursive glob pattern that matches a syserr.log file at any depth below channels/.
SYSERR_GLOB = "channels/**/syserr.log"
|
||||||
|
|
||||||
sys.path.insert(0, str(REPO_ROOT))
|
sys.path.insert(0, str(REPO_ROOT))
|
||||||
|
|
||||||
@@ -28,6 +29,7 @@ import channel_inventory
|
|||||||
|
|
||||||
|
|
||||||
AUTH_LOG_LINE_RE = re.compile(r"^\[(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{3})\] \[[^]]+\] (?P<message>.*)$")
|
AUTH_LOG_LINE_RE = re.compile(r"^\[(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{3})\] \[[^]]+\] (?P<message>.*)$")
|
||||||
|
# Matches a "[YYYY-MM-DD HH:MM:SS.mmm] [level] message" log line and captures
# the three parts as the named groups "timestamp", "level" and "message".
GENERIC_LOG_LINE_RE = re.compile(r"^\[(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{3})\] \[(?P<level>[^]]+)\] (?P<message>.*)$")
|
||||||
AUTH_CONN_RE = re.compile(r"SYSTEM: new connection from \[(?P<ip>[^]]+)\].* ptr (?P<desc>0x[0-9a-fA-F]+)")
|
AUTH_CONN_RE = re.compile(r"SYSTEM: new connection from \[(?P<ip>[^]]+)\].* ptr (?P<desc>0x[0-9a-fA-F]+)")
|
||||||
AUTH_LOGIN_RE = re.compile(r"InputAuth::Login : (?P<login>[^(]+)\(\d+\) desc (?P<desc>0x[0-9a-fA-F]+)")
|
AUTH_LOGIN_RE = re.compile(r"InputAuth::Login : (?P<login>[^(]+)\(\d+\) desc (?P<desc>0x[0-9a-fA-F]+)")
|
||||||
AUTH_INVALID_LOGIN_RE = re.compile(r"InputAuth::Login : IS_NOT_VALID_LOGIN_STRING\((?P<login>[^)]+)\) desc (?P<desc>0x[0-9a-fA-F]+)")
|
AUTH_INVALID_LOGIN_RE = re.compile(r"InputAuth::Login : IS_NOT_VALID_LOGIN_STRING\((?P<login>[^)]+)\) desc (?P<desc>0x[0-9a-fA-F]+)")
|
||||||
@@ -64,6 +66,16 @@ def parse_args() -> argparse.Namespace:
|
|||||||
auth_ips.add_argument("--include-smoke", action="store_true", help="Include smoke-test logins")
|
auth_ips.add_argument("--include-smoke", action="store_true", help="Include smoke-test logins")
|
||||||
auth_ips.add_argument("--json", action="store_true", help="Print raw JSON")
|
auth_ips.add_argument("--json", action="store_true", help="Print raw JSON")
|
||||||
|
|
||||||
|
recent_errors = subparsers.add_parser("recent-errors", help="Show recent syserr entries across runtime components")
|
||||||
|
recent_errors.add_argument("--hours", type=int, default=24, help="How many hours back to inspect")
|
||||||
|
recent_errors.add_argument("--limit", type=int, default=30, help="Maximum errors to show")
|
||||||
|
recent_errors.add_argument("--json", action="store_true", help="Print raw JSON")
|
||||||
|
|
||||||
|
error_summary = subparsers.add_parser("error-summary", help="Summarize recurring syserr entries")
|
||||||
|
error_summary.add_argument("--hours", type=int, default=24, help="How many hours back to inspect")
|
||||||
|
error_summary.add_argument("--limit", type=int, default=20, help="Maximum grouped errors to show")
|
||||||
|
error_summary.add_argument("--json", action="store_true", help="Print raw JSON")
|
||||||
|
|
||||||
sessions = subparsers.add_parser("sessions", help="Show recent login sessions from loginlog2")
|
sessions = subparsers.add_parser("sessions", help="Show recent login sessions from loginlog2")
|
||||||
sessions.add_argument("--hours", type=int, default=24, help="How many hours back to inspect")
|
sessions.add_argument("--hours", type=int, default=24, help="How many hours back to inspect")
|
||||||
sessions.add_argument("--limit", type=int, default=20, help="Maximum sessions to show")
|
sessions.add_argument("--limit", type=int, default=20, help="Maximum sessions to show")
|
||||||
@@ -214,6 +226,10 @@ def iter_core_files() -> list[Path]:
|
|||||||
return [path for path in sorted(RUNTIME_ROOT.glob("channels/**/core*")) if path.is_file()]
|
return [path for path in sorted(RUNTIME_ROOT.glob("channels/**/core*")) if path.is_file()]
|
||||||
|
|
||||||
|
|
||||||
|
def iter_syserr_files() -> list[Path]:
    """Return every regular file under RUNTIME_ROOT matching SYSERR_GLOB, sorted by path."""
    found = [candidate for candidate in RUNTIME_ROOT.glob(SYSERR_GLOB) if candidate.is_file()]
    found.sort()
    return found
|
||||||
|
|
||||||
|
|
||||||
def count_incident_bundles() -> int:
|
def count_incident_bundles() -> int:
|
||||||
if not INCIDENT_ROOT.exists():
|
if not INCIDENT_ROOT.exists():
|
||||||
return 0
|
return 0
|
||||||
@@ -360,6 +376,37 @@ def filter_auth_events(hours: int, include_smoke: bool, status: str) -> list[dic
|
|||||||
return filtered
|
return filtered
|
||||||
|
|
||||||
|
|
||||||
|
def load_syserr_entries(hours: int) -> list[dict[str, object]]:
    """Collect error-level syserr entries from the last *hours* hours.

    Every file returned by ``iter_syserr_files()`` is scanned. A line matching
    ``GENERIC_LOG_LINE_RE`` starts a new entry; any other non-blank line is
    treated as a continuation and appended to the preceding entry's message,
    separated by ``" | "``.

    Args:
        hours: Size of the look-back window, counted back from
            ``dt.datetime.now()``.

    Returns:
        Entries whose level is ``"error"`` (case-insensitive). Each entry is a
        dict with ``time``, ``level``, ``source`` (file path relative to
        ``RUNTIME_ROOT``) and ``message``. The list is ordered oldest first
        across all files, so ``result[-1]`` is the most recent error.
    """
    cutoff = dt.datetime.now() - dt.timedelta(hours=hours)
    entries: list[dict[str, object]] = []

    for path in iter_syserr_files():
        source = str(path.relative_to(RUNTIME_ROOT))
        # Entry that continuation lines attach to; None while the preceding
        # header line was out of the window or not an error.
        current: dict[str, object] | None = None

        with path.open("r", encoding="utf-8", errors="replace") as handle:
            for raw_line in handle:
                # NUL characters are removed before matching so they cannot
                # stop a header line from being recognised.
                line = raw_line.replace("\x00", "").rstrip("\n")
                match = GENERIC_LOG_LINE_RE.match(line)

                if match is None:
                    text = line.strip()
                    if current is not None and text:
                        current["message"] = f"{current['message']} | {text}"
                    continue

                timestamp = parse_auth_timestamp(match.group("timestamp"))
                level = match.group("level")
                if timestamp < cutoff or level.lower() != "error":
                    # Drop the entry and any continuation lines that follow it.
                    current = None
                    continue

                current = {
                    "time": timestamp,
                    "level": level,
                    "source": source,
                    "message": match.group("message").strip(),
                }
                entries.append(current)

    # Files are read one after another, so without this the result would be
    # grouped by file rather than chronological. list.sort is stable, which
    # keeps the in-file order for entries sharing a timestamp.
    entries.sort(key=lambda entry: entry["time"])
    return entries
|
||||||
|
|
||||||
|
|
||||||
def run_mariadb_query(query: str) -> list[list[str]]:
|
def run_mariadb_query(query: str) -> list[list[str]]:
|
||||||
completed = run(["mariadb", "-N", "-B", "-e", query], require_root=True, capture_output=True)
|
completed = run(["mariadb", "-N", "-B", "-e", query], require_root=True, capture_output=True)
|
||||||
rows: list[list[str]] = []
|
rows: list[list[str]] = []
|
||||||
@@ -560,6 +607,7 @@ def print_summary(hours: int, include_smoke: bool, as_json: bool) -> int:
|
|||||||
listening = live_ports()
|
listening = live_ports()
|
||||||
port_rows = iter_port_rows()
|
port_rows = iter_port_rows()
|
||||||
auth_summary = summarize_auth_activity(hours, include_smoke)
|
auth_summary = summarize_auth_activity(hours, include_smoke)
|
||||||
|
recent_errors = load_syserr_entries(hours)
|
||||||
stale_session_count = count_stale_sessions(hours=max(hours, 1), stale_minutes=30, include_orphans=False)
|
stale_session_count = count_stale_sessions(hours=max(hours, 1), stale_minutes=30, include_orphans=False)
|
||||||
stale_total_count = count_stale_sessions(hours=max(hours, 1), stale_minutes=30, include_orphans=True)
|
stale_total_count = count_stale_sessions(hours=max(hours, 1), stale_minutes=30, include_orphans=True)
|
||||||
stale_orphan_count = max(stale_total_count - stale_session_count, 0)
|
stale_orphan_count = max(stale_total_count - stale_session_count, 0)
|
||||||
@@ -584,6 +632,12 @@ def print_summary(hours: int, include_smoke: bool, as_json: bool) -> int:
|
|||||||
for row in port_rows
|
for row in port_rows
|
||||||
],
|
],
|
||||||
"auth": auth_summary,
|
"auth": auth_summary,
|
||||||
|
"recent_error_count": len(recent_errors),
|
||||||
|
"latest_error": {
|
||||||
|
"time": recent_errors[-1]["time"].strftime("%Y-%m-%d %H:%M:%S"),
|
||||||
|
"source": recent_errors[-1]["source"],
|
||||||
|
"message": recent_errors[-1]["message"],
|
||||||
|
} if recent_errors else None,
|
||||||
"stale_open_sessions": {
|
"stale_open_sessions": {
|
||||||
"user_count": stale_session_count,
|
"user_count": stale_session_count,
|
||||||
"orphan_count": stale_orphan_count,
|
"orphan_count": stale_orphan_count,
|
||||||
@@ -623,6 +677,7 @@ def print_summary(hours: int, include_smoke: bool, as_json: bool) -> int:
|
|||||||
print(f"game instances active: {game_active}/{len(enabled_game_units)} enabled ({len(game_units)} declared)")
|
print(f"game instances active: {game_active}/{len(enabled_game_units)} enabled ({len(game_units)} declared)")
|
||||||
print(f"core files: {core_count}")
|
print(f"core files: {core_count}")
|
||||||
print(f"incident bundles: {incident_count}")
|
print(f"incident bundles: {incident_count}")
|
||||||
|
print(f"recent syserr errors ({hours}h): {len(recent_errors)}")
|
||||||
print(f"stale open sessions (>30m): {stale_session_count} user, {stale_orphan_count} orphan")
|
print(f"stale open sessions (>30m): {stale_session_count} user, {stale_orphan_count} orphan")
|
||||||
print()
|
print()
|
||||||
|
|
||||||
@@ -648,6 +703,12 @@ def print_summary(hours: int, include_smoke: bool, as_json: bool) -> int:
|
|||||||
f"latest failure: {latest_failure['time'].strftime('%Y-%m-%d %H:%M:%S')} "
|
f"latest failure: {latest_failure['time'].strftime('%Y-%m-%d %H:%M:%S')} "
|
||||||
f"{latest_failure['login']} from {latest_failure['ip']} reason={latest_failure['reason']}"
|
f"{latest_failure['login']} from {latest_failure['ip']} reason={latest_failure['reason']}"
|
||||||
)
|
)
|
||||||
|
latest_error = payload["latest_error"]
|
||||||
|
if latest_error:
|
||||||
|
print(
|
||||||
|
f"latest error: {latest_error['time']} "
|
||||||
|
f"{latest_error['source']} {latest_error['message']}"
|
||||||
|
)
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
|
||||||
@@ -771,6 +832,94 @@ def print_auth_ips(hours: int, limit: int, include_smoke: bool, as_json: bool) -
|
|||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
|
||||||
|
def print_recent_errors(hours: int, limit: int, as_json: bool) -> int:
    """Print the most recent syserr error entries.

    Args:
        hours: Look-back window passed to ``load_syserr_entries``.
        limit: Maximum number of entries to show; zero or a negative value
            shows none.
        as_json: Emit a JSON document instead of a text table.

    Returns:
        Always ``0``.
    """
    # Order by timestamp so the tail holds the newest entries regardless of
    # which log file they came from.
    ordered = sorted(load_syserr_entries(hours), key=lambda entry: entry["time"])
    # A bare ``[-limit:]`` would return *every* entry for limit == 0 (since
    # -0 == 0) and drop the oldest ones for a negative limit.
    entries = ordered[-limit:] if limit > 0 else []

    formatted = [
        {
            "time": entry["time"].strftime("%Y-%m-%d %H:%M:%S"),
            "source": str(entry["source"]),
            "message": str(entry["message"]),
        }
        for entry in entries
    ]

    if as_json:
        payload = {
            "window_hours": hours,
            "limit": limit,
            "count": len(formatted),
            "entries": formatted,
        }
        print(json.dumps(payload, indent=2))
        return 0

    if not formatted:
        print(f"No syserr entries in the last {hours}h.")
        return 0

    rows = [[item["time"], item["source"], item["message"]] for item in formatted]
    print_table(["time", "source", "message"], rows)
    return 0
|
||||||
|
|
||||||
|
|
||||||
|
def print_error_summary(hours: int, limit: int, as_json: bool) -> int:
    """Print syserr errors grouped by (source, message), most frequent first.

    Groups are ranked by occurrence count and then by the time they were last
    seen, both descending, and cut to the first *limit* groups.

    Args:
        hours: Look-back window passed to ``load_syserr_entries``.
        limit: Maximum number of groups to show.
        as_json: Emit a JSON document instead of a text table.

    Returns:
        Always ``0``.
    """
    buckets: dict[tuple[str, str], dict[str, object]] = {}

    for record in load_syserr_entries(hours):
        source = str(record["source"])
        message = str(record["message"])
        seen_at = record["time"]

        bucket = buckets.get((source, message))
        if bucket is None:
            bucket = {"source": source, "message": message, "count": 0, "last_seen": seen_at}
            buckets[(source, message)] = bucket

        bucket["count"] = int(bucket["count"]) + 1
        if seen_at >= bucket["last_seen"]:
            bucket["last_seen"] = seen_at

    def rank(item: dict[str, object]) -> tuple[int, object]:
        # Primary key: how often the error occurred; secondary: recency.
        return int(item["count"]), item["last_seen"]

    ranked = sorted(buckets.values(), key=rank, reverse=True)
    rows = ranked[:limit]

    if as_json:
        summary = []
        for row in rows:
            summary.append(
                {
                    "source": str(row["source"]),
                    "count": int(row["count"]),
                    "last_seen": row["last_seen"].strftime("%Y-%m-%d %H:%M:%S"),
                    "message": str(row["message"]),
                }
            )
        document = {
            "window_hours": hours,
            "limit": limit,
            "count": len(rows),
            "entries": summary,
        }
        print(json.dumps(document, indent=2))
        return 0

    if len(rows) == 0:
        print(f"No syserr summary entries in the last {hours}h.")
        return 0

    table_rows = []
    for row in rows:
        last_seen_text = row["last_seen"].strftime("%Y-%m-%d %H:%M:%S")
        table_rows.append([str(row["count"]), last_seen_text, str(row["source"]), str(row["message"])])

    print_table(["count", "last_seen", "source", "message"], table_rows)
    return 0
|
||||||
|
|
||||||
|
|
||||||
def resolve_target_units(target: str) -> list[str]:
|
def resolve_target_units(target: str) -> list[str]:
|
||||||
normalized = target.strip().lower()
|
normalized = target.strip().lower()
|
||||||
|
|
||||||
@@ -1058,6 +1207,10 @@ def main() -> int:
|
|||||||
return print_auth_activity(args.hours, args.limit, args.status, args.include_smoke, args.json)
|
return print_auth_activity(args.hours, args.limit, args.status, args.include_smoke, args.json)
|
||||||
if args.command == "auth-ips":
|
if args.command == "auth-ips":
|
||||||
return print_auth_ips(args.hours, args.limit, args.include_smoke, args.json)
|
return print_auth_ips(args.hours, args.limit, args.include_smoke, args.json)
|
||||||
|
if args.command == "recent-errors":
|
||||||
|
return print_recent_errors(args.hours, args.limit, args.json)
|
||||||
|
if args.command == "error-summary":
|
||||||
|
return print_error_summary(args.hours, args.limit, args.json)
|
||||||
if args.command == "status":
|
if args.command == "status":
|
||||||
return print_status(args.target)
|
return print_status(args.target)
|
||||||
if args.command == "ports":
|
if args.command == "ports":
|
||||||
|
|||||||
@@ -35,6 +35,8 @@ The Debian deployment installs:
|
|||||||
- showing an operational summary
|
- showing an operational summary
|
||||||
- showing recent auth success/failure activity
|
- showing recent auth success/failure activity
|
||||||
- showing auth activity grouped by source IP
|
- showing auth activity grouped by source IP
|
||||||
|
- showing recent `syserr.log` entries
|
||||||
|
- summarizing recurring `syserr.log` entries
|
||||||
- viewing inventory
|
- viewing inventory
|
||||||
- listing managed units
|
- listing managed units
|
||||||
- checking service status
|
- checking service status
|
||||||
@@ -99,6 +101,18 @@ Show auth activity grouped by IP:
|
|||||||
metinctl auth-ips
|
metinctl auth-ips
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Show the latest runtime errors collected from all `syserr.log` files:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
metinctl recent-errors
|
||||||
|
```
|
||||||
|
|
||||||
|
Show the most repeated runtime errors in the last 24 hours:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
metinctl error-summary
|
||||||
|
```
|
||||||
|
|
||||||
Include smoke-test failures too:
|
Include smoke-test failures too:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
|
|||||||
Reference in New Issue
Block a user