Guard public channel readiness in systemd tooling

2026-04-15 17:46:56 +02:00
parent 6f16f66543
commit 2179c46ce0
6 changed files with 180 additions and 2 deletions
--- a/channel_inventory.py
+++ b/channel_inventory.py
@@ -51,6 +51,35 @@ def get_channel_ids() -> list[int]:
    return [int(channel["id"]) for channel in iter_channels()]


+def get_public_channel_ids(
+    selected_channel_ids: Iterable[int] | None = None,
+    *,
+    client_visible_only: bool = False,
+) -> list[int]:
+    """Return the ids of channels flagged ``public``, in ``iter_channels()`` order.
+
+    ``selected_channel_ids`` restricts the result to those ids (``None`` means
+    no restriction); ``client_visible_only`` additionally requires a truthy
+    ``client_visible`` flag on the channel.
+    """
+    wanted = None if selected_channel_ids is None else {int(cid) for cid in selected_channel_ids}
+    return [
+        channel_id
+        for channel_id, channel in ((int(entry["id"]), entry) for entry in iter_channels())
+        if (wanted is None or channel_id in wanted)
+        and channel.get("public")
+        and (not client_visible_only or channel.get("client_visible"))
+    ]
+
+
+def has_public_channel(
+    selected_channel_ids: Iterable[int] | None = None, *, client_visible_only: bool = False
+) -> bool:
+    """Return True when ``get_public_channel_ids`` yields at least one id for these filters."""
+    matching_ids = get_public_channel_ids(selected_channel_ids, client_visible_only=client_visible_only)
+    return len(matching_ids) > 0
+
+
 def get_channel_map() -> dict[int, dict[int, str]]:
    result: dict[int, dict[int, str]] = {}
    for channel in iter_channels():
--- a/deploy/systemd/README.md
+++ b/deploy/systemd/README.md
@@ -20,6 +20,8 @@ python3 deploy/systemd/install_systemd.py \

 `--channel-limit 1` is also supported and will auto-include channel `99` when present in the channel inventory.

+By default the installer refuses channel selections that omit every client-visible public channel. If you intentionally want an auth/internal-only stack, pass `--allow-internal-only`.
+
 The channel selection and port layout now come from the versioned inventory file:

 - [deploy/channel-inventory.json](../channel-inventory.json)
--- a/deploy/systemd/bin/metinctl.in
+++ b/deploy/systemd/bin/metinctl.in
@@ -97,6 +97,9 @@ def parse_args() -> argparse.Namespace:
    ports_parser = subparsers.add_parser("ports", help="Show declared listener ports")
    ports_parser.add_argument("--live", action="store_true", help="Also show whether the port is currently listening")

+    public_ready = subparsers.add_parser("public-ready", help="Verify enabled client-visible public channels are active and listening")
+    public_ready.add_argument("--json", action="store_true", help="Print raw JSON")
+
    for action in ("start", "stop", "restart"):
        action_parser = subparsers.add_parser(action, help=f"{action.title()} a managed target")
        action_parser.add_argument("target", help="stack, db, auth, game, channel:<id>, instance:<name>")
@@ -196,6 +199,7 @@ def iter_port_rows() -> list[dict[str, str]]:
            "p2p_port": "-",
            "unit": channel_inventory.DB_UNIT,
            "visibility": "internal",
+            "client_visible": False,
        },
        {
            "scope": "auth",
@@ -204,6 +208,7 @@ def iter_port_rows() -> list[dict[str, str]]:
            "p2p_port": str(channel_inventory.get_auth()["p2p_port"]),
            "unit": channel_inventory.AUTH_UNIT,
            "visibility": "public",
+            "client_visible": False,
        },
    ]

@@ -221,6 +226,7 @@ def iter_port_rows() -> list[dict[str, str]]:
                    "p2p_port": str(core["p2p_port"]),
                    "unit": channel_inventory.game_unit(instance),
                    "visibility": visibility,
+                    "client_visible": bool(channel.get("client_visible")),
                }
            )

@@ -593,6 +599,60 @@ def print_units() -> int:
    return 0


+def public_runtime_report() -> dict[str, object]:
+    """Summarise unit state and listener liveness for client-visible public channels.
+
+    Considers the ``iter_port_rows()`` rows whose scope starts with ``channel:``,
+    whose visibility is ``public`` and whose ``client_visible`` flag is truthy.
+    ``ready`` is true only when at least one such unit is enabled and every
+    enabled unit is active with its declared port listening; each violation
+    is described in ``issues``.
+    """
+    open_ports = live_ports()
+    # Lazy generator: each row is filtered just before its unit is queried.
+    candidate_rows = (
+        row
+        for row in iter_port_rows()
+        if row["scope"].startswith("channel:") and row["visibility"] == "public" and row["client_visible"]
+    )
+
+    entries: list[dict[str, object]] = []
+    for row in candidate_rows:
+        unit_active, unit_sub, unit_enabled = get_unit_state(row["unit"])
+        port = int(row["port"])
+        entries.append(
+            {
+                "scope": row["scope"],
+                "name": row["name"],
+                "port": port,
+                "p2p_port": int(row["p2p_port"]),
+                "unit": row["unit"],
+                "active": unit_active,
+                "sub": unit_sub,
+                "enabled": unit_enabled,
+                "live": port in open_ports,
+            }
+        )
+
+    enabled_entries = [entry for entry in entries if entry["enabled"] == "enabled"]
+    issues: list[str] = [] if enabled_entries else ["No client-visible public channel units are enabled."]
+    for entry in enabled_entries:
+        if entry["active"] != "active":
+            issues.append(f"{entry['name']} unit is {entry['active']}/{entry['sub']}.")
+        if not entry["live"]:
+            issues.append(f"{entry['name']} port {entry['port']} is not listening.")
+
+    return {
+        "ready": not issues,
+        "declared_count": len(entries),
+        "enabled_count": len(enabled_entries),
+        "active_enabled_count": len([entry for entry in enabled_entries if entry["active"] == "active"]),
+        "live_enabled_count": len([entry for entry in enabled_entries if entry["live"]]),
+        "entries": entries,
+        "issues": issues,
+    }
+
+
 def print_summary(hours: int, include_smoke: bool, as_json: bool) -> int:
    units = [
        channel_inventory.STACK_UNIT,
@@ -622,6 +682,7 @@ def print_summary(hours: int, include_smoke: bool, as_json: bool) -> int:
    }
    incident_count = count_incident_bundles()
    core_count = len(iter_core_files())
+    public_runtime = public_runtime_report()

    payload = {
        "repos": repos,
@@ -648,6 +709,7 @@ def print_summary(hours: int, include_smoke: bool, as_json: bool) -> int:
            "orphan_count": stale_orphan_count,
            "total_count": stale_total_count,
        },
+        "public_runtime": public_runtime,
        "core_count": core_count,
        "incident_count": incident_count,
    }
@@ -688,6 +750,15 @@ def print_summary(hours: int, include_smoke: bool, as_json: bool) -> int:

    print("Public Ports")
    print_table(["name", "port", "p2p", "live"], public_port_rows)
+    print(
+        "client-visible public channels ready: "
+        f"{'yes' if public_runtime['ready'] else 'no'} "
+        f"({public_runtime['active_enabled_count']}/{public_runtime['enabled_count']} enabled units active, "
+        f"{public_runtime['live_enabled_count']}/{public_runtime['enabled_count']} listening)"
+    )
+    if public_runtime["issues"]:
+        for issue in public_runtime["issues"]:
+            print(f"warning: {issue}")
    print()

    print(f"Auth ({hours}h)")
@@ -983,6 +1054,41 @@ def print_ports(show_live: bool) -> int:
    return 0


+def print_public_ready(as_json: bool) -> int:
+    """Print the public runtime report; return 0 when it is ready, otherwise 1.
+
+    ``as_json`` selects indented JSON instead of the table, summary and issue lines.
+    """
+    report = public_runtime_report()
+    exit_code = 0 if report["ready"] else 1
+    if as_json:
+        print(json.dumps(report, indent=2))
+        return exit_code
+
+    if not report["entries"]:
+        print("No client-visible public channels declared in the inventory.")
+    else:
+        text_columns = ["name", "port", "active", "sub", "enabled"]
+        table_rows = [
+            [*(str(entry[column]) for column in text_columns), "yes" if entry["live"] else "no"]
+            for entry in report["entries"]
+        ]
+        print_table([*text_columns, "live"], table_rows)
+
+    print()
+    verdict = "yes" if report["ready"] else "no"
+    print(
+        "ready: "
+        f"{verdict} "
+        f"({report['active_enabled_count']}/{report['enabled_count']} enabled units active, "
+        f"{report['live_enabled_count']}/{report['enabled_count']} listening)"
+    )
+    for issue in report["issues"]:
+        print(f"issue: {issue}")
+
+    return exit_code
+
+
 def print_cores(as_json: bool) -> int:
    entries = []
    for path in iter_core_files():
@@ -1172,6 +1278,15 @@ def run_wait_ready(timeout_seconds: int, interval_seconds: float) -> int:
    while time.time() < deadline:
        attempt += 1
        print(f"Healthcheck attempt {attempt}...")
+        public_runtime = public_runtime_report()
+        if not public_runtime["ready"]:
+            for issue in public_runtime["issues"]:
+                print(f"Public runtime not ready: {issue}")
+            remaining = deadline - time.time()
+            if remaining <= 0:
+                break
+            time.sleep(min(interval_seconds, remaining))
+            continue
        completed = subprocess.run(
            build_command([str(HEALTHCHECK_PATH), "--mode", "ready"], require_root=True),
            check=False,
@@ -1233,6 +1348,8 @@ def main() -> int:
        return print_status(args.target)
    if args.command == "ports":
        return print_ports(args.live)
+    if args.command == "public-ready":
+        return print_public_ready(args.json)
    if args.command == "cores":
        return print_cores(args.json)
    if args.command == "incidents":
--- a/deploy/systemd/install_systemd.py
+++ b/deploy/systemd/install_systemd.py
@@ -32,6 +32,11 @@ def parse_args() -> argparse.Namespace:
    parser.add_argument("--wait-port", type=int, default=9000, help="DB readiness port")
    parser.add_argument("--wait-timeout", type=int, default=30, help="DB readiness timeout in seconds")
    parser.add_argument("--restart", action="store_true", help="Restart metin-server.service after install")
+    parser.add_argument(
+        "--allow-internal-only",
+        action="store_true",
+        help="Allow installs that omit every client-visible public channel",
+    )

    channel_group = parser.add_mutually_exclusive_group(required=True)
    channel_group.add_argument(
@@ -76,7 +81,7 @@ def copy_file(source: Path, destination: Path, mode: int) -> None:

 def resolve_channels(args: argparse.Namespace) -> list[int]:
    try:
-        return channel_inventory.resolve_selected_channels(
+        selected_channels = channel_inventory.resolve_selected_channels(
            channel_limit=args.channel_limit,
            explicit_channels=args.channels,
        )
@@ -84,6 +89,20 @@ def resolve_channels(args: argparse.Namespace) -> list[int]:
        print(str(exc), file=sys.stderr)
        raise SystemExit(1)

+    if not args.allow_internal_only and not channel_inventory.has_public_channel(
+        selected_channels,
+        client_visible_only=True,
+    ):
+        print(
+            "Selected channels do not include any client-visible public channel. "
+            "Add a public channel such as --channel 1, or pass --allow-internal-only "
+            "if an auth/internal-only stack is intentional.",
+            file=sys.stderr,
+        )
+        raise SystemExit(1)
+
+    return selected_channels
+

 def resolve_instances(selected_channels: list[int]) -> list[str]:
    return channel_inventory.get_instances(selected_channels)
--- a/docs/healthchecks.md
+++ b/docs/healthchecks.md
@@ -132,12 +132,15 @@ Useful direct flags:
 Operational CLI:

 ```bash
+metinctl public-ready
 metinctl healthcheck --mode full
 metinctl healthcheck --mode ready
 metinctl wait-ready
 ```

-`metinctl wait-ready` now uses the lighter `ready` mode on purpose. The deeper `full` mode remains available as an explicit admin healthcheck.
+`metinctl public-ready` verifies that every enabled client-visible public channel unit is active and that its declared listener port is actually up.
+
+`metinctl wait-ready` now first waits for the public runtime to be up and only then runs the lighter `ready` login probe. The deeper `full` mode remains available as an explicit admin healthcheck.

 Example negative auth test:

--- a/docs/server-management.md
+++ b/docs/server-management.md
@@ -41,6 +41,7 @@ The Debian deployment installs:
 - listing managed units
 - checking service status
 - listing declared ports
+- verifying that enabled client-visible public channels are actually up
 - listing recent auth failures
 - listing recent login sessions
 - listing stale open sessions without logout
@@ -78,6 +79,12 @@ Show declared ports and whether they are currently listening:
 metinctl ports --live
 ```

+Verify that enabled client-visible public channels are active and listening:
+
+```bash
+metinctl public-ready
+```
+
 Show recent real auth failures and skip smoke-test logins:

 ```bash
@@ -219,6 +226,7 @@ It also reconciles enabled game instance units against the selected channels:
 - selected game units are enabled
 - stale game units are disabled
 - if `--restart` is passed, stale game units are disabled with `--now`
+- installs now refuse an auth/internal-only channel selection unless you pass `--allow-internal-only`

 This makes channel enablement declarative instead of depending on whatever happened to be enabled previously.