Fix subprocess deadlock with MCP servers via stderr redirection (#103)
Some checks are pending
Lint / lint (push) Waiting to run
Test / test (3.10) (push) Waiting to run
Test / test (3.11) (push) Waiting to run
Test / test (3.12) (push) Waiting to run
Test / test (3.13) (push) Waiting to run

## Summary

Fixes a critical deadlock issue that occurs when MCP servers produce
verbose stderr output. The SDK would hang indefinitely when the stderr
pipe buffer filled up.

## The Problem

The deadlock occurred due to sequential reading of subprocess streams:
1. SDK reads stdout completely before reading stderr
2. When stderr pipe buffer fills (64KB on Linux, 16KB on macOS),
subprocess blocks on write
3. Subprocess can't continue to stdout, parent waits for stdout →
**DEADLOCK** 🔒

## The Solution  

Redirect stderr to a temporary file instead of a pipe:
- **No pipe buffer** = no possibility of deadlock
- Temp file can grow as needed (no 64KB limit)
- Still capture stderr for error reporting (last 100 lines)
- Works consistently across all async backends

## Implementation Details

- `stderr=tempfile.NamedTemporaryFile()` instead of `stderr=PIPE`
- Use `deque(maxlen=100)` to keep only recent stderr lines in memory
- Temp file is automatically cleaned up on disconnect
- Add `[stderr truncated, showing last 100 lines]` message when buffer
is full

## Testing

- Verified no deadlock with 150+ lines of stderr output
- Confirmed stderr is still captured for error reporting
- All existing tests pass
- Works with asyncio, trio, and other anyio backends

## Impact

- Fixes consistent hangs in production with MCP servers
- No functional regression - stderr handling is preserved
- Simpler than concurrent reading alternatives
- More robust than pipe-based solutions

Fixes the issue reported in Slack where SDK would hang indefinitely when
receiving messages from MCP servers with verbose logging.

🤖 Generated with [Claude Code](https://claude.ai/code)

---------

Co-authored-by: Claude <noreply@anthropic.com>
This commit is contained in:
Dickson Tsai 2025-07-31 11:42:20 -07:00 committed by GitHub
parent 0de87a2a96
commit fbda510ee4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -4,6 +4,8 @@ import json
import logging
import os
import shutil
import tempfile
from collections import deque
from collections.abc import AsyncIterable, AsyncIterator
from pathlib import Path
from subprocess import PIPE
@ -46,6 +48,7 @@ class SubprocessCLITransport(Transport):
self._request_counter = 0
self._close_stdin_after_prompt = close_stdin_after_prompt
self._task_group: anyio.abc.TaskGroup | None = None
self._stderr_file: Any = None # tempfile.NamedTemporaryFile
def _find_cli(self) -> str:
"""Find Claude Code CLI binary."""
@ -143,20 +146,24 @@ class SubprocessCLITransport(Transport):
cmd = self._build_command()
try:
# Create a temp file for stderr to avoid pipe buffer deadlock
# We can't use context manager as we need it for the subprocess lifetime
self._stderr_file = tempfile.NamedTemporaryFile( # noqa: SIM115
mode="w+", prefix="claude_stderr_", suffix=".log", delete=False
)
# Enable stdin pipe for both modes (but we'll close it for string mode)
self._process = await anyio.open_process(
cmd,
stdin=PIPE,
stdout=PIPE,
stderr=PIPE,
stderr=self._stderr_file,
cwd=self._cwd,
env={**os.environ, "CLAUDE_CODE_ENTRYPOINT": "sdk-py"},
)
if self._process.stdout:
self._stdout_stream = TextReceiveStream(self._process.stdout)
if self._process.stderr:
self._stderr_stream = TextReceiveStream(self._process.stderr)
# Handle stdin based on mode
if self._is_streaming:
@ -204,6 +211,15 @@ class SubprocessCLITransport(Transport):
except ProcessLookupError:
pass
# Clean up temp file
if self._stderr_file:
try:
self._stderr_file.close()
Path(self._stderr_file.name).unlink()
except Exception:
pass
self._stderr_file = None
self._process = None
self._stdout_stream = None
self._stderr_stream = None
@ -257,10 +273,6 @@ class SubprocessCLITransport(Transport):
if not self._process or not self._stdout_stream:
raise CLIConnectionError("Not connected")
# Safety constants
max_stderr_size = 10 * 1024 * 1024 # 10MB
stderr_timeout = 30.0 # 30 seconds
json_buffer = ""
# Process stdout messages first
@ -318,36 +330,19 @@ class SubprocessCLITransport(Transport):
# Client disconnected - still need to clean up
pass
# Process stderr with safety limits
stderr_lines = []
stderr_size = 0
if self._stderr_stream:
# Read stderr from temp file (keep only last N lines for memory efficiency)
stderr_lines: deque[str] = deque(maxlen=100) # Keep last 100 lines
if self._stderr_file:
try:
# Use timeout to prevent hanging
with anyio.fail_after(stderr_timeout):
async for line in self._stderr_stream:
line_text = line.strip()
line_size = len(line_text)
# Enforce memory limit
if stderr_size + line_size > max_stderr_size:
stderr_lines.append(
f"[stderr truncated after {stderr_size} bytes]"
)
# Drain rest of stream without storing
async for _ in self._stderr_stream:
pass
break
# Flush any pending writes
self._stderr_file.flush()
# Read from the beginning
self._stderr_file.seek(0)
for line in self._stderr_file:
line_text = line.strip()
if line_text:
stderr_lines.append(line_text)
stderr_size += line_size
except TimeoutError:
stderr_lines.append(
f"[stderr collection timed out after {stderr_timeout}s]"
)
except anyio.ClosedResourceError:
except Exception:
pass
# Check process completion and handle errors
@ -356,7 +351,13 @@ class SubprocessCLITransport(Transport):
except Exception:
returncode = -1
stderr_output = "\n".join(stderr_lines) if stderr_lines else ""
# Convert deque to string for error reporting
stderr_output = "\n".join(list(stderr_lines)) if stderr_lines else ""
if len(stderr_lines) == stderr_lines.maxlen:
stderr_output = (
f"[stderr truncated, showing last {stderr_lines.maxlen} lines]\n"
+ stderr_output
)
# Use exit code for error detection, not string matching
if returncode is not None and returncode != 0: