Guard public channel readiness in systemd tooling

2026-04-15 17:46:56 +02:00
parent 6f16f66543
commit 2179c46ce0
6 changed files with 180 additions and 2 deletions
--- a/channel_inventory.py
+++ b/channel_inventory.py
@@ -51,6 +51,35 @@ def get_channel_ids() -> list[int]:
    return [int(channel["id"]) for channel in iter_channels()]
 def get_public_channel_ids(
    selected_channel_ids: Iterable[int] | None = None,
    *,
    client_visible_only: bool = False,
 ) -> list[int]:
    """Return the ids of inventory channels flagged as public.

    Channels are visited in the order ``iter_channels()`` yields them.

    Args:
        selected_channel_ids: Optional restriction; when given, only channels
            whose id (compared as ``int``) is in this collection are
            considered. ``None`` means every inventory channel is considered.
        client_visible_only: When true, additionally require a truthy
            ``client_visible`` entry on the channel.

    Returns:
        The matching channel ids as integers.
    """
    wanted: set[int] | None = None
    if selected_channel_ids is not None:
        # Normalise once so the per-channel membership test is a set lookup.
        wanted = {int(raw_id) for raw_id in selected_channel_ids}
    public_ids: list[int] = []
    for channel in iter_channels():
        channel_id = int(channel["id"])
        in_selection = wanted is None or channel_id in wanted
        if (
            in_selection
            and channel.get("public")
            and (not client_visible_only or channel.get("client_visible"))
        ):
            public_ids.append(channel_id)
    return public_ids
 def has_public_channel(
    selected_channel_ids: Iterable[int] | None = None,
    *,
    client_visible_only: bool = False,
 ) -> bool:
    """Tell whether at least one public channel matches the given filters.

    Both arguments are forwarded unchanged to ``get_public_channel_ids``;
    the result is true exactly when that call returns a non-empty list.
    """
    matching_ids = get_public_channel_ids(
        selected_channel_ids,
        client_visible_only=client_visible_only,
    )
    return len(matching_ids) > 0
 def get_channel_map() -> dict[int, dict[int, str]]:
    result: dict[int, dict[int, str]] = {}
    for channel in iter_channels():
--- a/deploy/systemd/README.md
+++ b/deploy/systemd/README.md
@@ -20,6 +20,8 @@ python3 deploy/systemd/install_systemd.py \
 `--channel-limit 1` is also supported and will auto-include channel `99` when present in the channel inventory.
 By default the installer refuses channel selections that omit every client-visible public channel. If you intentionally want an auth/internal-only stack, pass `--allow-internal-only`.
 The channel selection and port layout now come from the versioned inventory file:
 - [deploy/channel-inventory.json](../channel-inventory.json)
--- a/deploy/systemd/bin/metinctl.in
+++ b/deploy/systemd/bin/metinctl.in
@@ -97,6 +97,9 @@ def parse_args() -> argparse.Namespace:
    ports_parser = subparsers.add_parser("ports", help="Show declared listener ports")
    ports_parser.add_argument("--live", action="store_true", help="Also show whether the port is currently listening")
    public_ready = subparsers.add_parser("public-ready", help="Verify enabled client-visible public channels are active and listening")
    public_ready.add_argument("--json", action="store_true", help="Print raw JSON")
    for action in ("start", "stop", "restart"):
        action_parser = subparsers.add_parser(action, help=f"{action.title()} a managed target")
        action_parser.add_argument("target", help="stack, db, auth, game, channel:<id>, instance:<name>")
@@ -196,6 +199,7 @@ def iter_port_rows() -> list[dict[str, str]]:
            "p2p_port": "-",
            "unit": channel_inventory.DB_UNIT,
            "visibility": "internal",
            "client_visible": False,
        },
        {
            "scope": "auth",
@@ -204,6 +208,7 @@ def iter_port_rows() -> list[dict[str, str]]:
            "p2p_port": str(channel_inventory.get_auth()["p2p_port"]),
            "unit": channel_inventory.AUTH_UNIT,
            "visibility": "public",
            "client_visible": False,
        },
    ]
@@ -221,6 +226,7 @@ def iter_port_rows() -> list[dict[str, str]]:
                    "p2p_port": str(core["p2p_port"]),
                    "unit": channel_inventory.game_unit(instance),
                    "visibility": visibility,
                    "client_visible": bool(channel.get("client_visible")),
                }
            )
@@ -593,6 +599,60 @@ def print_units() -> int:
    return 0
 def public_runtime_report() -> dict[str, object]:
    """Summarise the runtime state of client-visible public channel units.

    Every port row whose scope starts with ``channel:``, whose visibility is
    ``public`` and whose ``client_visible`` flag is truthy becomes one entry.
    Readiness is judged only on entries whose unit enablement state is exactly
    ``"enabled"``: each such unit must be ``"active"`` and its port must be in
    the result of ``live_ports()``. Having no enabled entry at all is itself
    reported as an issue.

    Returns:
        A dict with the keys ``ready``, ``declared_count``, ``enabled_count``,
        ``active_enabled_count``, ``live_enabled_count``, ``entries`` and
        ``issues``. ``ready`` is true exactly when ``issues`` is empty.
    """
    # presumably a collection of listening port numbers as ints - confirm against live_ports()
    open_ports = live_ports()
    entries: list[dict[str, object]] = []
    for row in iter_port_rows():
        is_client_facing_channel = (
            row["scope"].startswith("channel:")
            and row["visibility"] == "public"
            and row["client_visible"]
        )
        if not is_client_facing_channel:
            continue
        unit_active, unit_sub, unit_enabled = get_unit_state(row["unit"])
        port = int(row["port"])
        entries.append(
            {
                "scope": row["scope"],
                "name": row["name"],
                "port": port,
                "p2p_port": int(row["p2p_port"]),
                "unit": row["unit"],
                "active": unit_active,
                "sub": unit_sub,
                "enabled": unit_enabled,
                "live": port in open_ports,
            }
        )
    enabled_entries = [item for item in entries if item["enabled"] == "enabled"]
    issues: list[str] = []
    if len(enabled_entries) == 0:
        issues.append("No client-visible public channel units are enabled.")
    for item in enabled_entries:
        # Unit state is reported before the port state for each entry.
        if item["active"] != "active":
            issues.append(f"{item['name']} unit is {item['active']}/{item['sub']}.")
        if not item["live"]:
            issues.append(f"{item['name']} port {item['port']} is not listening.")
    active_enabled = len([item for item in enabled_entries if item["active"] == "active"])
    live_enabled = len([item for item in enabled_entries if item["live"]])
    return {
        "ready": not issues,
        "declared_count": len(entries),
        "enabled_count": len(enabled_entries),
        "active_enabled_count": active_enabled,
        "live_enabled_count": live_enabled,
        "entries": entries,
        "issues": issues,
    }
 def print_summary(hours: int, include_smoke: bool, as_json: bool) -> int:
    units = [
        channel_inventory.STACK_UNIT,
@@ -622,6 +682,7 @@ def print_summary(hours: int, include_smoke: bool, as_json: bool) -> int:
    }
    incident_count = count_incident_bundles()
    core_count = len(iter_core_files())
    public_runtime = public_runtime_report()
    payload = {
        "repos": repos,
@@ -648,6 +709,7 @@ def print_summary(hours: int, include_smoke: bool, as_json: bool) -> int:
            "orphan_count": stale_orphan_count,
            "total_count": stale_total_count,
        },
        "public_runtime": public_runtime,
        "core_count": core_count,
        "incident_count": incident_count,
    }
@@ -688,6 +750,15 @@ def print_summary(hours: int, include_smoke: bool, as_json: bool) -> int:
    print("Public Ports")
    print_table(["name", "port", "p2p", "live"], public_port_rows)
    print(
        "client-visible public channels ready: "
        f"{'yes' if public_runtime['ready'] else 'no'} "
        f"({public_runtime['active_enabled_count']}/{public_runtime['enabled_count']} enabled units active, "
        f"{public_runtime['live_enabled_count']}/{public_runtime['enabled_count']} listening)"
    )
    if public_runtime["issues"]:
        for issue in public_runtime["issues"]:
            print(f"warning: {issue}")
    print()
    print(f"Auth ({hours}h)")
@@ -983,6 +1054,41 @@ def print_ports(show_live: bool) -> int:
    return 0
 def print_public_ready(as_json: bool) -> int:
    """Print the public-channel readiness report and return an exit status.

    Args:
        as_json: When true, print the report from ``public_runtime_report()``
            as indented JSON and nothing else. Otherwise print a table of the
            declared entries (or a notice when there are none), a one-line
            readiness summary and one ``issue:`` line per reported issue.

    Returns:
        ``0`` when the report's ``ready`` flag is truthy, otherwise ``1``.
    """
    report = public_runtime_report()
    if as_json:
        print(json.dumps(report, indent=2))
        return 0 if report["ready"] else 1
    if not report["entries"]:
        print("No client-visible public channels declared in the inventory.")
    else:
        table_rows = []
        for entry in report["entries"]:
            live_label = "yes" if entry["live"] else "no"
            table_rows.append(
                [
                    str(entry["name"]),
                    str(entry["port"]),
                    str(entry["active"]),
                    str(entry["sub"]),
                    str(entry["enabled"]),
                    live_label,
                ]
            )
        print_table(["name", "port", "active", "sub", "enabled", "live"], table_rows)
    print()
    print(
        "ready: "
        f"{'yes' if report['ready'] else 'no'} "
        f"({report['active_enabled_count']}/{report['enabled_count']} enabled units active, "
        f"{report['live_enabled_count']}/{report['enabled_count']} listening)"
    )
    for issue in report["issues"]:
        print(f"issue: {issue}")
    return 0 if report["ready"] else 1
 def print_cores(as_json: bool) -> int:
    entries = []
    for path in iter_core_files():
@@ -1172,6 +1278,15 @@ def run_wait_ready(timeout_seconds: int, interval_seconds: float) -> int:
    while time.time() < deadline:
        attempt += 1
        print(f"Healthcheck attempt {attempt}...")
        public_runtime = public_runtime_report()
        if not public_runtime["ready"]:
            for issue in public_runtime["issues"]:
                print(f"Public runtime not ready: {issue}")
            remaining = deadline - time.time()
            if remaining <= 0:
                break
            time.sleep(min(interval_seconds, remaining))
            continue
        completed = subprocess.run(
            build_command([str(HEALTHCHECK_PATH), "--mode", "ready"], require_root=True),
            check=False,
@@ -1233,6 +1348,8 @@ def main() -> int:
        return print_status(args.target)
    if args.command == "ports":
        return print_ports(args.live)
    if args.command == "public-ready":
        return print_public_ready(args.json)
    if args.command == "cores":
        return print_cores(args.json)
    if args.command == "incidents":
--- a/deploy/systemd/install_systemd.py
+++ b/deploy/systemd/install_systemd.py
@@ -32,6 +32,11 @@ def parse_args() -> argparse.Namespace:
    parser.add_argument("--wait-port", type=int, default=9000, help="DB readiness port")
    parser.add_argument("--wait-timeout", type=int, default=30, help="DB readiness timeout in seconds")
    parser.add_argument("--restart", action="store_true", help="Restart metin-server.service after install")
    parser.add_argument(
        "--allow-internal-only",
        action="store_true",
        help="Allow installs that omit every client-visible public channel",
    )
    channel_group = parser.add_mutually_exclusive_group(required=True)
    channel_group.add_argument(
@@ -76,7 +81,7 @@ def copy_file(source: Path, destination: Path, mode: int) -> None:
 def resolve_channels(args: argparse.Namespace) -> list[int]:
    try:
-        return channel_inventory.resolve_selected_channels(
+        selected_channels = channel_inventory.resolve_selected_channels(
            channel_limit=args.channel_limit,
            explicit_channels=args.channels,
        )
@@ -84,6 +89,20 @@ def resolve_channels(args: argparse.Namespace) -> list[int]:
        print(str(exc), file=sys.stderr)
        raise SystemExit(1)
    if not args.allow_internal_only and not channel_inventory.has_public_channel(
        selected_channels,
        client_visible_only=True,
    ):
        print(
            "Selected channels do not include any client-visible public channel. "
            "Add a public channel such as --channel 1, or pass --allow-internal-only "
            "if an auth/internal-only stack is intentional.",
            file=sys.stderr,
        )
        raise SystemExit(1)
    return selected_channels
 def resolve_instances(selected_channels: list[int]) -> list[str]:
    return channel_inventory.get_instances(selected_channels)
--- a/docs/healthchecks.md
+++ b/docs/healthchecks.md
@@ -132,12 +132,15 @@ Useful direct flags:
 Operational CLI:
 ```bash
 metinctl public-ready
 metinctl healthcheck --mode full
 metinctl healthcheck --mode ready
 metinctl wait-ready
 ```
-`metinctl wait-ready` now uses the lighter `ready` mode on purpose. The deeper `full` mode remains available as an explicit admin healthcheck.
+`metinctl public-ready` verifies that every enabled client-visible public channel unit is active and that its declared listener port is actually up.
 `metinctl wait-ready` now first waits for the public runtime to be up and only then runs the lighter `ready` login probe. The deeper `full` mode remains available as an explicit admin healthcheck.
 Example negative auth test:
--- a/docs/server-management.md
+++ b/docs/server-management.md
@@ -41,6 +41,7 @@ The Debian deployment installs:
 - listing managed units
 - checking service status
 - listing declared ports
 - verifying that enabled client-visible public channels are active and listening
 - listing recent auth failures
 - listing recent login sessions
 - listing stale open sessions without logout
@@ -78,6 +79,12 @@ Show declared ports and whether they are currently listening:
 metinctl ports --live
 ```
 Verify that enabled client-visible public channels are active and listening:
 ```bash
 metinctl public-ready
 ```
 Show recent real auth failures and skip smoke-test logins:
 ```bash
@@ -219,6 +226,7 @@ It also reconciles enabled game instance units against the selected channels:
 - selected game units are enabled
 - stale game units are disabled
 - if `--restart` is passed, stale game units are disabled with `--now`
 - installs now refuse an auth/internal-only channel selection unless you pass `--allow-internal-only`
 This makes channel enablement declarative instead of depending on whatever happened to be enabled previously.