fix(wifi): stagger driver ws dials and extend initial retry window

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
pi
2026-05-03 21:27:31 +12:00
parent c1b0c41ef2
commit b316edbaf9
2 changed files with 67 additions and 9 deletions

View File

@@ -195,6 +195,27 @@ async def _recv_forward_loop(ip: str, ws) -> None:
pass
def _stagger_delay_s_for_ip(ip: str) -> float:
    """Return a deterministic per-device connect delay in seconds.

    The delay grows linearly with the last IPv4 octet of ``ip``: ``.0`` maps
    to 0 and ``.255`` maps to the ``wifi_driver_connect_stagger_max_s``
    setting (default 2.5). Different addresses therefore get different,
    repeatable delays.

    Args:
        ip: Dotted-quad IPv4 address. Any other shape disables the delay.

    Returns:
        A finite delay in ``[0, max_s]``. ``0.0`` is returned when settings
        are not loaded, when the configured maximum is zero or negative, or
        when ``ip`` is not four dot-separated fields ending in an integer.
    """
    import math  # Function-scope import keeps this block self-contained.

    if _settings is None:
        return 0.0

    default_max_s = 2.5
    try:
        max_s = float(
            _settings.get("wifi_driver_connect_stagger_max_s", default_max_s)
        )
    except (TypeError, ValueError):
        max_s = default_max_s
    if not math.isfinite(max_s):
        # float() accepts "inf"/"nan"; an infinite delay would make the
        # caller sleep forever, so treat it like any other invalid value.
        max_s = default_max_s
    if max_s <= 0:
        return 0.0

    parts = str(ip).strip().split(".")
    if len(parts) != 4:
        return 0.0
    try:
        # The modulo keeps an out-of-range last field inside 0..255.
        last = int(parts[3]) % 256
    except ValueError:
        return 0.0
    return (last / 255.0) * max_s
async def _driver_connection_loop(ip: str) -> None:
global _settings
if _settings is None:
@@ -204,16 +225,37 @@ async def _driver_connection_loop(ip: str) -> None:
if not path.startswith("/"):
path = "/" + path
uri = f"ws://{ip}:{port}{path}"
try:
retry_interval_s = float(_settings.get("wifi_driver_connect_retry_interval_s", 2.0))
except (TypeError, ValueError):
retry_interval_s = 2.0
retry_window_s = 30.0
deadline = asyncio.get_running_loop().time() + retry_window_s
retry_interval_s = max(0.2, retry_interval_s)
try:
retry_window_s = float(_settings.get("wifi_driver_connect_retry_window_s", 120.0))
except (TypeError, ValueError):
retry_window_s = 120.0
retry_window_s = max(5.0, retry_window_s)
try:
open_timeout = float(_settings.get("wifi_driver_ws_open_timeout", 45.0))
except (TypeError, ValueError):
open_timeout = 45.0
open_timeout = max(5.0, open_timeout)
loop = asyncio.get_running_loop()
stagger = _stagger_delay_s_for_ip(ip)
if stagger > 0:
await asyncio.sleep(stagger)
# Only bound boot-time: after we have connected once, keep retrying (Wi-Fi drops, reboots).
connected_once = False
deadline = loop.time() + retry_window_s
try:
while True:
now = asyncio.get_running_loop().time()
if now >= deadline:
now = loop.time()
if not connected_once and now >= deadline:
print(
f"[WS] driver {ip} still unreachable after {int(retry_window_s)}s; "
"stopping retries until next hello"
f"[WS] driver {ip} still unreachable after {int(retry_window_s)}s "
f"(initial window); stopping until next UDP hello / registry prime"
)
break
try:
@@ -222,8 +264,9 @@ async def _driver_connection_loop(ip: str) -> None:
uri,
ping_interval=20,
ping_timeout=15,
open_timeout=30,
open_timeout=open_timeout,
) as ws:
connected_once = True
_register_ws(ip, ws)
try:
await _recv_forward_loop(ip, ws)
@@ -239,7 +282,9 @@ async def _driver_connection_loop(ip: str) -> None:
n = _unreachable_counts.get(ip, 0) + 1
_unreachable_counts[ip] = n
if n == 1 or (n % 30) == 0:
print(f"[WS] driver {ip} unreachable, retry in 2s: {e} (x{n})")
print(
f"[WS] driver {ip} unreachable, retry in {retry_interval_s}s: {e} (x{n})"
)
else:
print(f"[WS] driver {ip} session error: {e!r}")
traceback.print_exception(type(e), e, e.__traceback__)

View File

@@ -57,6 +57,19 @@ class Settings(dict):
# down (0 disables). Helps drivers that reconnect after seeing traffic on 8766.
if 'wifi_driver_hello_interval_s' not in self:
self['wifi_driver_hello_interval_s'] = 10.0
# Outbound WebSocket dial: total seconds to keep trying before first success
# (many devices booting at once need more than a short window).
if 'wifi_driver_connect_retry_window_s' not in self:
self['wifi_driver_connect_retry_window_s'] = 120.0
# Spread outbound dials 0..N s by device IP so six+ drivers do not all hit the AP at once.
if 'wifi_driver_connect_stagger_max_s' not in self:
self['wifi_driver_connect_stagger_max_s'] = 2.5
# TCP/WebSocket open timeout per attempt (seconds).
if 'wifi_driver_ws_open_timeout' not in self:
self['wifi_driver_ws_open_timeout'] = 45.0
# Pause between outbound WebSocket dial attempts (seconds).
if 'wifi_driver_connect_retry_interval_s' not in self:
self['wifi_driver_connect_retry_interval_s'] = 2.0
# UART to ESP32 ESP-NOW bridge; default off (Wi-Fi drivers need no serial).
if 'serial_enabled' not in self:
self['serial_enabled'] = False