From b316edbaf954e5b5c2c0ba55b9bae6708a997f0f Mon Sep 17 00:00:00 2001
From: pi
Date: Sun, 3 May 2026 21:27:31 +1200
Subject: [PATCH] fix(wifi): stagger driver ws dials and extend initial retry window

Co-authored-by: Cursor
---
Note (not part of the commit message): the line structure of this patch was
restored from a whitespace-collapsed copy. Hunk line counts match the @@
headers and the diffstat below, but leading indentation inside hunk lines is
inferred from the Python block structure; confirm with `git apply --check`
(or apply with `--ignore-whitespace`) before relying on it.

 src/models/wifi_ws_clients.py | 63 ++++++++++++++++++++++++++++++-----
 src/settings.py               | 13 ++++++++
 2 files changed, 67 insertions(+), 9 deletions(-)

diff --git a/src/models/wifi_ws_clients.py b/src/models/wifi_ws_clients.py
index 1143d2a..fdd6a2c 100644
--- a/src/models/wifi_ws_clients.py
+++ b/src/models/wifi_ws_clients.py
@@ -195,6 +195,27 @@ async def _recv_forward_loop(ip: str, ws) -> None:
         pass
 
 
+def _stagger_delay_s_for_ip(ip: str) -> float:
+    """0 .. wifi_driver_connect_stagger_max_s based on last IPv4 octet (deterministic spread)."""
+    global _settings
+    if _settings is None:
+        return 0.0
+    try:
+        max_s = float(_settings.get("wifi_driver_connect_stagger_max_s", 2.5))
+    except (TypeError, ValueError):
+        max_s = 2.5
+    if max_s <= 0:
+        return 0.0
+    parts = str(ip).strip().split(".")
+    if len(parts) != 4:
+        return 0.0
+    try:
+        last = int(parts[3]) % 256
+    except ValueError:
+        return 0.0
+    return (last / 255.0) * max_s
+
+
 async def _driver_connection_loop(ip: str) -> None:
     global _settings
     if _settings is None:
@@ -204,16 +225,37 @@ async def _driver_connection_loop(ip: str) -> None:
         if not path.startswith("/"):
             path = "/" + path
     uri = f"ws://{ip}:{port}{path}"
-    retry_interval_s = 2.0
-    retry_window_s = 30.0
-    deadline = asyncio.get_running_loop().time() + retry_window_s
+    try:
+        retry_interval_s = float(_settings.get("wifi_driver_connect_retry_interval_s", 2.0))
+    except (TypeError, ValueError):
+        retry_interval_s = 2.0
+    retry_interval_s = max(0.2, retry_interval_s)
+    try:
+        retry_window_s = float(_settings.get("wifi_driver_connect_retry_window_s", 120.0))
+    except (TypeError, ValueError):
+        retry_window_s = 120.0
+    retry_window_s = max(5.0, retry_window_s)
+    try:
+        open_timeout = float(_settings.get("wifi_driver_ws_open_timeout", 45.0))
+    except (TypeError, ValueError):
+        open_timeout = 45.0
+    open_timeout = max(5.0, open_timeout)
+
+    loop = asyncio.get_running_loop()
+    stagger = _stagger_delay_s_for_ip(ip)
+    if stagger > 0:
+        await asyncio.sleep(stagger)
+
+    # Only bound boot-time: after we have connected once, keep retrying (Wi-Fi drops, reboots).
+    connected_once = False
+    deadline = loop.time() + retry_window_s
     try:
         while True:
-            now = asyncio.get_running_loop().time()
-            if now >= deadline:
+            now = loop.time()
+            if not connected_once and now >= deadline:
                 print(
-                    f"[WS] driver {ip} still unreachable after {int(retry_window_s)}s; "
-                    "stopping retries until next hello"
+                    f"[WS] driver {ip} still unreachable after {int(retry_window_s)}s "
+                    f"(initial window); stopping until next UDP hello / registry prime"
                 )
                 break
             try:
@@ -222,8 +264,9 @@ async def _driver_connection_loop(ip: str) -> None:
                     uri,
                     ping_interval=20,
                     ping_timeout=15,
-                    open_timeout=30,
+                    open_timeout=open_timeout,
                 ) as ws:
+                    connected_once = True
                     _register_ws(ip, ws)
                     try:
                         await _recv_forward_loop(ip, ws)
@@ -239,7 +282,9 @@ async def _driver_connection_loop(ip: str) -> None:
                     n = _unreachable_counts.get(ip, 0) + 1
                     _unreachable_counts[ip] = n
                     if n == 1 or (n % 30) == 0:
-                        print(f"[WS] driver {ip} unreachable, retry in 2s: {e} (x{n})")
+                        print(
+                            f"[WS] driver {ip} unreachable, retry in {retry_interval_s}s: {e} (x{n})"
+                        )
                 else:
                     print(f"[WS] driver {ip} session error: {e!r}")
                     traceback.print_exception(type(e), e, e.__traceback__)
diff --git a/src/settings.py b/src/settings.py
index f7a74a9..43ab9c9 100644
--- a/src/settings.py
+++ b/src/settings.py
@@ -57,6 +57,19 @@ class Settings(dict):
         # down (0 disables). Helps drivers that reconnect after seeing traffic on 8766.
         if 'wifi_driver_hello_interval_s' not in self:
             self['wifi_driver_hello_interval_s'] = 10.0
+        # Outbound WebSocket dial: total seconds to keep trying before first success
+        # (many devices booting at once need more than a short window).
+        if 'wifi_driver_connect_retry_window_s' not in self:
+            self['wifi_driver_connect_retry_window_s'] = 120.0
+        # Spread outbound dials 0..N s by device IP so six+ drivers do not all hit the AP at once.
+        if 'wifi_driver_connect_stagger_max_s' not in self:
+            self['wifi_driver_connect_stagger_max_s'] = 2.5
+        # TCP/WebSocket open timeout per attempt (seconds).
+        if 'wifi_driver_ws_open_timeout' not in self:
+            self['wifi_driver_ws_open_timeout'] = 45.0
+        # Pause between outbound WebSocket dial attempts (seconds).
+        if 'wifi_driver_connect_retry_interval_s' not in self:
+            self['wifi_driver_connect_retry_interval_s'] = 2.0
         # UART to ESP32 ESP-NOW bridge; default off (Wi-Fi drivers need no serial).
         if 'serial_enabled' not in self:
             self['serial_enabled'] = False