Fix subprocess deadlock with MCP servers by redirecting stderr to temp file

The SDK was experiencing deadlocks when MCP servers produced verbose stderr
output. The issue occurred because:
1. The SDK read stdout and stderr sequentially (stdout first, then stderr)
2. When stderr buffer filled up (64KB on Linux, 16KB on macOS), the subprocess
   would block trying to write more to stderr
3. Since the subprocess was blocked, it couldn't write to stdout
4. The parent process was waiting for stdout, creating a deadlock

This fix redirects stderr to a temporary file instead of a pipe, which:
- Eliminates the pipe buffer limitation (files have no such restriction)
- Prevents any possibility of deadlock
- Still captures stderr for error reporting (keeping last 100 lines)
- Works consistently across all async backends (asyncio, trio, etc.)

The temp file is automatically cleaned up when the subprocess ends, and we
use a circular buffer (deque) to keep only the last 100 lines in memory for
error reporting purposes.

Fixes deadlock issue reported in Slack where SDK would hang indefinitely
when receiving messages from MCP servers.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Dickson Tsai 2025-07-31 11:01:38 -07:00
parent fa3962de3f
commit 27b49bcdca
No known key found for this signature in database

View file

@ -4,6 +4,8 @@ import json
import logging
import os
import shutil
import tempfile
from collections import deque
from collections.abc import AsyncIterable, AsyncIterator
from pathlib import Path
from subprocess import PIPE
@ -46,6 +48,7 @@ class SubprocessCLITransport(Transport):
self._request_counter = 0
self._close_stdin_after_prompt = close_stdin_after_prompt
self._task_group: anyio.abc.TaskGroup | None = None
self._stderr_file: Any = None # tempfile.NamedTemporaryFile
def _find_cli(self) -> str:
"""Find Claude Code CLI binary."""
@ -143,20 +146,24 @@ class SubprocessCLITransport(Transport):
cmd = self._build_command()
try:
# Create a temp file for stderr to avoid pipe buffer deadlock
# We can't use context manager as we need it for the subprocess lifetime
self._stderr_file = tempfile.NamedTemporaryFile( # noqa: SIM115
mode='w+', prefix='claude_stderr_', suffix='.log', delete=False
)
# Enable stdin pipe for both modes (but we'll close it for string mode)
self._process = await anyio.open_process(
cmd,
stdin=PIPE,
stdout=PIPE,
stderr=PIPE,
stderr=self._stderr_file,
cwd=self._cwd,
env={**os.environ, "CLAUDE_CODE_ENTRYPOINT": "sdk-py"},
)
if self._process.stdout:
self._stdout_stream = TextReceiveStream(self._process.stdout)
if self._process.stderr:
self._stderr_stream = TextReceiveStream(self._process.stderr)
# Handle stdin based on mode
if self._is_streaming:
@ -204,6 +211,15 @@ class SubprocessCLITransport(Transport):
except ProcessLookupError:
pass
# Clean up temp file
if self._stderr_file:
try:
self._stderr_file.close()
Path(self._stderr_file.name).unlink()
except Exception:
pass
self._stderr_file = None
self._process = None
self._stdout_stream = None
self._stderr_stream = None
@ -257,10 +273,6 @@ class SubprocessCLITransport(Transport):
if not self._process or not self._stdout_stream:
raise CLIConnectionError("Not connected")
# Safety constants
max_stderr_size = 10 * 1024 * 1024 # 10MB
stderr_timeout = 30.0 # 30 seconds
json_buffer = ""
# Process stdout messages first
@ -318,36 +330,19 @@ class SubprocessCLITransport(Transport):
# Client disconnected - still need to clean up
pass
# Process stderr with safety limits
stderr_lines = []
stderr_size = 0
if self._stderr_stream:
# Read stderr from temp file (keep only last N lines for memory efficiency)
stderr_lines: deque[str] = deque(maxlen=100) # Keep last 100 lines
if self._stderr_file:
try:
# Use timeout to prevent hanging
with anyio.fail_after(stderr_timeout):
async for line in self._stderr_stream:
line_text = line.strip()
line_size = len(line_text)
# Enforce memory limit
if stderr_size + line_size > max_stderr_size:
stderr_lines.append(
f"[stderr truncated after {stderr_size} bytes]"
)
# Drain rest of stream without storing
async for _ in self._stderr_stream:
pass
break
# Flush any pending writes
self._stderr_file.flush()
# Read from the beginning
self._stderr_file.seek(0)
for line in self._stderr_file:
line_text = line.strip()
if line_text:
stderr_lines.append(line_text)
stderr_size += line_size
except TimeoutError:
stderr_lines.append(
f"[stderr collection timed out after {stderr_timeout}s]"
)
except anyio.ClosedResourceError:
except Exception:
pass
# Check process completion and handle errors
@ -356,7 +351,10 @@ class SubprocessCLITransport(Transport):
except Exception:
returncode = -1
stderr_output = "\n".join(stderr_lines) if stderr_lines else ""
# Convert deque to string for error reporting
stderr_output = "\n".join(list(stderr_lines)) if stderr_lines else ""
if len(stderr_lines) == stderr_lines.maxlen:
stderr_output = f"[stderr truncated, showing last {stderr_lines.maxlen} lines]\n" + stderr_output
# Use exit code for error detection, not string matching
if returncode is not None and returncode != 0: