ops: add auth IP and stale session audit

This commit is contained in:
server
2026-04-14 16:13:47 +02:00
parent f722475f17
commit cd2e1d61ca
2 changed files with 227 additions and 0 deletions

View File

@@ -58,6 +58,12 @@ def parse_args() -> argparse.Namespace:
auth_activity.add_argument("--include-smoke", action="store_true", help="Include smoke-test logins")
auth_activity.add_argument("--json", action="store_true", help="Print raw JSON")
auth_ips = subparsers.add_parser("auth-ips", help="Summarize auth activity by source IP")
auth_ips.add_argument("--hours", type=int, default=24, help="How many hours back to inspect")
auth_ips.add_argument("--limit", type=int, default=20, help="Maximum IPs to show")
auth_ips.add_argument("--include-smoke", action="store_true", help="Include smoke-test logins")
auth_ips.add_argument("--json", action="store_true", help="Print raw JSON")
sessions = subparsers.add_parser("sessions", help="Show recent login sessions from loginlog2")
sessions.add_argument("--hours", type=int, default=24, help="How many hours back to inspect")
sessions.add_argument("--limit", type=int, default=20, help="Maximum sessions to show")
@@ -65,6 +71,13 @@ def parse_args() -> argparse.Namespace:
sessions.add_argument("--include-orphans", action="store_true", help="Include rows whose account login no longer exists")
sessions.add_argument("--json", action="store_true", help="Print raw JSON")
session_audit = subparsers.add_parser("session-audit", help="Show stale open sessions without logout")
session_audit.add_argument("--hours", type=int, default=72, help="How many hours back to inspect")
session_audit.add_argument("--stale-minutes", type=int, default=30, help="Minimum age for an open session to be considered stale")
session_audit.add_argument("--limit", type=int, default=20, help="Maximum sessions to show")
session_audit.add_argument("--include-orphans", action="store_true", help="Include rows whose account login no longer exists")
session_audit.add_argument("--json", action="store_true", help="Print raw JSON")
status_parser = subparsers.add_parser("status", help="Show current unit state")
status_parser.add_argument("target", nargs="?", default="all", help="stack, db, auth, game, channel:<id>, instance:<name>")
@@ -402,6 +415,75 @@ LIMIT {int(limit)}
return entries
def fetch_stale_sessions(hours: int, stale_minutes: int, limit: int, include_orphans: bool) -> list[dict[str, str]]:
    """Return open login sessions (no logout recorded) that have gone stale.

    Rows are read from log.loginlog2, left-joined to account.account.

    Args:
        hours: Only sessions whose login_time is within this many hours count.
        stale_minutes: Minimum session age, in minutes, to be reported.
        limit: Maximum number of rows to return (newest login first).
        include_orphans: When false, rows without a matching account login
            are filtered out in SQL.

    Returns:
        One dict per session with the keys login_time, raw_type, login,
        account_id, pid, ip and age_minutes.
    """
    conditions = [
        f"l.login_time >= NOW() - INTERVAL {int(hours)} HOUR",
        "l.logout_time IS NULL",
        f"TIMESTAMPDIFF(MINUTE, l.login_time, NOW()) >= {int(stale_minutes)}",
    ]
    if not include_orphans:
        conditions.append("a.login IS NOT NULL")
    where_sql = " AND ".join(conditions)

    query = f"""
SELECT
DATE_FORMAT(l.login_time, '%Y-%m-%d %H:%i:%s'),
l.type,
COALESCE(a.login, ''),
l.account_id,
l.pid,
COALESCE(INET_NTOA(l.ip), ''),
TIMESTAMPDIFF(MINUTE, l.login_time, NOW())
FROM log.loginlog2 l
LEFT JOIN account.account a ON a.id = l.account_id
WHERE {where_sql}
ORDER BY l.login_time DESC
LIMIT {int(limit)}
""".strip()

    # Key order mirrors the SELECT column order above.
    keys = ("login_time", "raw_type", "login", "account_id", "pid", "ip", "age_minutes")
    width = len(keys)

    sessions: list[dict[str, str]] = []
    for row in run_mariadb_query(query):
        # Short rows are padded in place so every column has a value.
        if len(row) < width:
            row.extend([""] * (width - len(row)))
        record = dict(zip(keys, row[:width]))
        # Substitute display placeholders for empty columns.
        record["login"] = record["login"] or f"<missing:{record['account_id']}>"
        record["ip"] = record["ip"] or "-"
        record["age_minutes"] = record["age_minutes"] or "0"
        sessions.append(record)
    return sessions
def count_stale_sessions(hours: int, stale_minutes: int, include_orphans: bool) -> int:
    """Count open login sessions (no logout recorded) that have gone stale.

    Args:
        hours: Only sessions whose login_time is within this many hours count.
        stale_minutes: Minimum session age, in minutes, to be counted.
        include_orphans: When false, rows without a matching account login
            are excluded from the count.

    Returns:
        The row count, or 0 when the query yields nothing or a value that
        does not parse as an integer.
    """
    orphan_filter = [] if include_orphans else ["a.login IS NOT NULL"]
    predicate = " AND ".join(
        [
            f"l.login_time >= NOW() - INTERVAL {int(hours)} HOUR",
            "l.logout_time IS NULL",
            f"TIMESTAMPDIFF(MINUTE, l.login_time, NOW()) >= {int(stale_minutes)}",
            *orphan_filter,
        ]
    )
    query = f"""
SELECT COUNT(*)
FROM log.loginlog2 l
LEFT JOIN account.account a ON a.id = l.account_id
WHERE {predicate}
""".strip()

    result = run_mariadb_query(query)
    if result and result[0]:
        try:
            return int(result[0][0])
        except ValueError:
            # Unparseable count: fall through to the zero default.
            pass
    return 0
def live_ports() -> set[int]:
if shutil.which("ss") is None:
return set()
@@ -478,6 +560,9 @@ def print_summary(hours: int, include_smoke: bool, as_json: bool) -> int:
listening = live_ports()
port_rows = iter_port_rows()
auth_summary = summarize_auth_activity(hours, include_smoke)
stale_session_count = count_stale_sessions(hours=max(hours, 1), stale_minutes=30, include_orphans=False)
stale_total_count = count_stale_sessions(hours=max(hours, 1), stale_minutes=30, include_orphans=True)
stale_orphan_count = max(stale_total_count - stale_session_count, 0)
repos = {
"m2dev-server": git_summary(REPO_ROOT),
"m2dev-server-src": git_summary(SOURCE_REPO_ROOT),
@@ -499,6 +584,11 @@ def print_summary(hours: int, include_smoke: bool, as_json: bool) -> int:
for row in port_rows
],
"auth": auth_summary,
"stale_open_sessions": {
"user_count": stale_session_count,
"orphan_count": stale_orphan_count,
"total_count": stale_total_count,
},
"core_count": core_count,
"incident_count": incident_count,
}
@@ -533,6 +623,7 @@ def print_summary(hours: int, include_smoke: bool, as_json: bool) -> int:
print(f"game instances active: {game_active}/{len(enabled_game_units)} enabled ({len(game_units)} declared)")
print(f"core files: {core_count}")
print(f"incident bundles: {incident_count}")
print(f"stale open sessions (>30m): {stale_session_count} user, {stale_orphan_count} orphan")
print()
print("Public Ports")
@@ -603,6 +694,83 @@ def print_auth_activity(hours: int, limit: int, status: str, include_smoke: bool
return 0
def print_auth_ips(hours: int, limit: int, include_smoke: bool, as_json: bool) -> int:
    """Print auth activity aggregated per source IP and return exit code 0.

    Each IP gets a success counter, a failure counter and the time, login
    and reason of its most recent event. IPs are ranked by failure count,
    then success count, then last-seen time (all descending) and cut to
    ``limit`` entries. Output is JSON when ``as_json`` is set, otherwise a
    table (or a one-line notice when there is nothing to show).
    """
    time_format = "%Y-%m-%d %H:%M:%S"

    buckets: dict[str, dict[str, object]] = {}
    for event in filter_auth_events(hours, include_smoke, "all"):
        source_ip = str(event["ip"])
        seen_at = event["time"]
        login = str(event["login"])
        reason = str(event["reason"])

        entry = buckets.get(source_ip)
        if entry is None:
            entry = buckets[source_ip] = {
                "ip": source_ip,
                "success_count": 0,
                "failure_count": 0,
                "last_seen": seen_at,
                "last_login": login,
                "last_reason": reason,
            }

        counter = "success_count" if event["status"] == "success" else "failure_count"
        entry[counter] = int(entry[counter]) + 1

        # ">=" lets a later event with an equal timestamp win the "last" slot.
        if seen_at >= entry["last_seen"]:
            entry["last_seen"] = seen_at
            entry["last_login"] = login
            entry["last_reason"] = reason

    def rank(item: dict[str, object]) -> tuple:
        return (
            int(item["failure_count"]),
            int(item["success_count"]),
            item["last_seen"],
        )

    ranked = list(buckets.values())
    ranked.sort(key=rank, reverse=True)
    top = ranked[:limit]

    report = []
    for item in top:
        report.append(
            {
                "ip": str(item["ip"]),
                "success_count": int(item["success_count"]),
                "failure_count": int(item["failure_count"]),
                "last_seen": item["last_seen"].strftime(time_format),
                "last_login": str(item["last_login"]),
                "last_reason": str(item["last_reason"]),
            }
        )

    payload = {
        "window_hours": hours,
        "limit": limit,
        "include_smoke": include_smoke,
        "count": len(top),
        "entries": report,
    }

    if as_json:
        print(json.dumps(payload, indent=2))
        return 0

    if not top:
        print(f"No auth IP activity in the last {hours}h.")
        return 0

    # Table cells are the stringified payload fields, in header order.
    fields = ("ip", "success_count", "failure_count", "last_seen", "last_login", "last_reason")
    table_rows = [[str(line[field]) for field in fields] for line in report]
    print_table(["ip", "success", "failure", "last_seen", "last_login", "last_reason"], table_rows)
    return 0
def resolve_target_units(target: str) -> list[str]:
normalized = target.strip().lower()
@@ -797,6 +965,41 @@ def print_sessions(hours: int, limit: int, active_only: bool, include_orphans: b
return 0
def print_session_audit(hours: int, stale_minutes: int, limit: int, include_orphans: bool, as_json: bool) -> int:
    """Print stale open sessions (no logout recorded) and return exit code 0.

    Sessions come from fetch_stale_sessions(). With ``as_json`` the query
    parameters and entries are dumped as JSON; otherwise a table is printed,
    or a one-line notice when no session matched.
    """
    stale = fetch_stale_sessions(hours, stale_minutes, limit, include_orphans)

    if as_json:
        document = {
            "window_hours": hours,
            "stale_minutes": stale_minutes,
            "limit": limit,
            "include_orphans": include_orphans,
            "count": len(stale),
            "entries": stale,
        }
        print(json.dumps(document, indent=2))
        return 0

    if not stale:
        print(f"No stale open sessions older than {stale_minutes} minutes in the last {hours}h.")
        return 0

    # Entry keys in the order of the table headers passed below.
    columns = ("login_time", "age_minutes", "login", "account_id", "pid", "ip", "raw_type")
    table = [[session[column] for column in columns] for session in stale]
    print_table(["login_time", "age_min", "login", "account", "pid", "ip", "raw_type"], table)
    return 0
def run_healthcheck(mode: str) -> int:
if not HEALTHCHECK_PATH.exists():
raise SystemExit(f"Missing healthcheck wrapper: {HEALTHCHECK_PATH}")
@@ -853,6 +1056,8 @@ def main() -> int:
return print_summary(args.hours, args.include_smoke, args.json)
if args.command == "auth-activity":
return print_auth_activity(args.hours, args.limit, args.status, args.include_smoke, args.json)
if args.command == "auth-ips":
return print_auth_ips(args.hours, args.limit, args.include_smoke, args.json)
if args.command == "status":
return print_status(args.target)
if args.command == "ports":
@@ -865,6 +1070,8 @@ def main() -> int:
return print_auth_failures(args.hours, args.limit, args.include_smoke, args.json)
if args.command == "sessions":
return print_sessions(args.hours, args.limit, args.active_only, args.include_orphans, args.json)
if args.command == "session-audit":
return print_session_audit(args.hours, args.stale_minutes, args.limit, args.include_orphans, args.json)
if args.command in {"start", "stop", "restart"}:
return run_unit_action(args.command, args.target)
if args.command == "logs":

View File

@@ -34,12 +34,14 @@ The Debian deployment installs:
- showing an operational summary
- showing recent auth success/failure activity
- showing auth activity grouped by source IP
- viewing inventory
- listing managed units
- checking service status
- listing declared ports
- listing recent auth failures
- listing recent login sessions
- listing stale open sessions without logout
- restarting the whole stack or specific channels/instances
- viewing logs
- listing core files in the runtime tree
@@ -91,6 +93,12 @@ Show only recent auth failures including smoke tests:
metinctl auth-activity --status failure --include-smoke
```
Show auth activity grouped by IP:
```bash
metinctl auth-ips
```
Include smoke-test failures too:
```bash
@@ -109,6 +117,18 @@ Show only sessions that still have no recorded logout:
metinctl sessions --active-only
```
Show open sessions with no recorded logout that are older than 30 minutes (the default threshold; looks back 72 hours by default):
```bash
metinctl session-audit
```
Use a different stale threshold:
```bash
metinctl session-audit --stale-minutes 10
```
Restart only channel 1 cores:
```bash