Try fixing gw worker failures

This commit is contained in:
Rich Chiodo false 2025-12-09 14:36:15 -08:00
parent 0385fe9ab9
commit 991fede562
4 changed files with 56 additions and 8 deletions

View file

@ -1,5 +1,5 @@
[pytest]
testpaths=tests
timeout=60
timeout=120
timeout_method=thread
addopts=-n8

View file

@ -281,7 +281,11 @@ class Session(object):
if self.adapter_endpoints is not None and self.expected_exit_code is not None:
log.info("Waiting for {0} to close listener ports ...", self.adapter_id)
timeout_start = time.time()
while self.adapter_endpoints.check():
if time.time() - timeout_start > 10:
log.warning("{0} listener ports did not close within 10 seconds", self.adapter_id)
break
time.sleep(0.1)
if self.adapter is not None:
@ -290,8 +294,20 @@ class Session(object):
self.adapter_id,
self.adapter.pid,
)
self.adapter.wait()
watchdog.unregister_spawn(self.adapter.pid, self.adapter_id)
try:
self.adapter.wait(timeout=10)
except Exception:
log.warning("{0} did not exit gracefully within 10 seconds, force-killing", self.adapter_id)
try:
self.adapter.kill()
self.adapter.wait(timeout=5)
except Exception as e:
log.error("Failed to force-kill {0}: {1}", self.adapter_id, e)
try:
watchdog.unregister_spawn(self.adapter.pid, self.adapter_id)
except Exception as e:
log.warning("Failed to unregister adapter spawn: {0}", e)
self.adapter = None
if self.backchannel is not None:

View file

@ -17,7 +17,7 @@ from tests.patterns import some
used_ports = set()
def get_test_server_port():
def get_test_server_port(max_retries=10):
"""Returns a server port number that can be safely used for listening without
clashing with another test worker process, when running with pytest-xdist.
@ -27,6 +27,9 @@ def get_test_server_port():
Note that if multiple test workers invoke this function with different ranges
that overlap, conflicts are possible!
Args:
max_retries: Number of times to retry finding an available port
"""
try:
@ -39,11 +42,32 @@ def get_test_server_port():
), "Unrecognized PYTEST_XDIST_WORKER format"
n = int(worker_id[2:])
# Probe up to max_retries candidate ports, starting at this worker's base port
for attempt in range(max_retries):
port = 5678 + (n * 300) + attempt
while port in used_ports:
port += 1
# Verify the port is actually available by trying to bind to it
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
try:
sock.bind(("127.0.0.1", port))
sock.close()
used_ports.add(port)
log.info("Allocated port {0} for worker {1}", port, n)
return port
except OSError as e:
log.warning("Port {0} unavailable (attempt {1}/{2}): {3}", port, attempt + 1, max_retries, e)
sock.close()
time.sleep(0.1 * (attempt + 1)) # Linear backoff: delay grows by 0.1 s per attempt
# Fall back to original behavior if all retries fail
port = 5678 + (n * 300)
while port in used_ports:
port += 1
used_ports.add(port)
log.warning("Using fallback port {0} after {1} retries", port, max_retries)
return port

View file

@ -46,19 +46,27 @@ def test_wrapper(request, long_tmpdir):
session.Session.reset_counter()
session.Session.tmpdir = long_tmpdir
# Add worker-specific isolation for tmpdir and log directory
try:
worker_id = os.environ.get("PYTEST_XDIST_WORKER", "gw0")
worker_suffix = f"_{worker_id}"
except Exception:
worker_suffix = ""
session.Session.tmpdir = long_tmpdir / f"session{worker_suffix}"
session.Session.tmpdir.ensure(dir=True)
original_log_dir = log.log_dir
failed = True
try:
if log.log_dir is None:
log.log_dir = (long_tmpdir / "debugpy_logs").strpath
log.log_dir = (long_tmpdir / f"debugpy_logs{worker_suffix}").strpath
else:
log_subdir = request.node.nodeid
log_subdir = log_subdir.replace("::", "/")
for ch in r":?*|<>":
log_subdir = log_subdir.replace(ch, f"&#{ord(ch)};")
log.log_dir += "/" + log_subdir
log.log_dir += "/" + log_subdir + worker_suffix
try:
py.path.local(log.log_dir).remove()