mirror of
https://github.com/anthropics/claude-code-sdk-python.git
synced 2025-12-23 09:19:52 +00:00
Fix subprocess deadlock with MCP servers via stderr redirection (#103)
## Summary Fixes a critical deadlock issue that occurs when MCP servers produce verbose stderr output. The SDK would hang indefinitely when the stderr pipe buffer filled up. ## The Problem The deadlock occurred due to sequential reading of subprocess streams: 1. SDK reads stdout completely before reading stderr 2. When stderr pipe buffer fills (64KB on Linux, 16KB on macOS), subprocess blocks on write 3. Subprocess can't continue to stdout, parent waits for stdout → **DEADLOCK** 🔒 ## The Solution Redirect stderr to a temporary file instead of a pipe: - **No pipe buffer** = no possibility of deadlock - Temp file can grow as needed (no 64KB limit) - Still capture stderr for error reporting (last 100 lines) - Works consistently across all async backends ## Implementation Details - `stderr=tempfile.NamedTemporaryFile()` instead of `stderr=PIPE` - Use `deque(maxlen=100)` to keep only recent stderr lines in memory - Temp file is automatically cleaned up on disconnect - Add `[stderr truncated, showing last 100 lines]` message when buffer is full ## Testing - Verified no deadlock with 150+ lines of stderr output - Confirmed stderr is still captured for error reporting - All existing tests pass - Works with asyncio, trio, and other anyio backends ## Impact - Fixes consistent hangs in production with MCP servers - No functional regression - stderr handling is preserved - Simpler than concurrent reading alternatives - More robust than pipe-based solutions Fixes the issue reported in Slack where SDK would hang indefinitely when receiving messages from MCP servers with verbose logging. 🤖 Generated with [Claude Code](https://claude.ai/code) --------- Co-authored-by: Claude <noreply@anthropic.com>
This commit is contained in:
parent
0de87a2a96
commit
fbda510ee4
1 changed files with 37 additions and 36 deletions
|
|
@ -4,6 +4,8 @@ import json
|
|||
import logging
|
||||
import os
|
||||
import shutil
|
||||
import tempfile
|
||||
from collections import deque
|
||||
from collections.abc import AsyncIterable, AsyncIterator
|
||||
from pathlib import Path
|
||||
from subprocess import PIPE
|
||||
|
|
@ -46,6 +48,7 @@ class SubprocessCLITransport(Transport):
|
|||
self._request_counter = 0
|
||||
self._close_stdin_after_prompt = close_stdin_after_prompt
|
||||
self._task_group: anyio.abc.TaskGroup | None = None
|
||||
self._stderr_file: Any = None # tempfile.NamedTemporaryFile
|
||||
|
||||
def _find_cli(self) -> str:
|
||||
"""Find Claude Code CLI binary."""
|
||||
|
|
@ -143,20 +146,24 @@ class SubprocessCLITransport(Transport):
|
|||
|
||||
cmd = self._build_command()
|
||||
try:
|
||||
# Create a temp file for stderr to avoid pipe buffer deadlock
|
||||
# We can't use context manager as we need it for the subprocess lifetime
|
||||
self._stderr_file = tempfile.NamedTemporaryFile( # noqa: SIM115
|
||||
mode="w+", prefix="claude_stderr_", suffix=".log", delete=False
|
||||
)
|
||||
|
||||
# Enable stdin pipe for both modes (but we'll close it for string mode)
|
||||
self._process = await anyio.open_process(
|
||||
cmd,
|
||||
stdin=PIPE,
|
||||
stdout=PIPE,
|
||||
stderr=PIPE,
|
||||
stderr=self._stderr_file,
|
||||
cwd=self._cwd,
|
||||
env={**os.environ, "CLAUDE_CODE_ENTRYPOINT": "sdk-py"},
|
||||
)
|
||||
|
||||
if self._process.stdout:
|
||||
self._stdout_stream = TextReceiveStream(self._process.stdout)
|
||||
if self._process.stderr:
|
||||
self._stderr_stream = TextReceiveStream(self._process.stderr)
|
||||
|
||||
# Handle stdin based on mode
|
||||
if self._is_streaming:
|
||||
|
|
@ -204,6 +211,15 @@ class SubprocessCLITransport(Transport):
|
|||
except ProcessLookupError:
|
||||
pass
|
||||
|
||||
# Clean up temp file
|
||||
if self._stderr_file:
|
||||
try:
|
||||
self._stderr_file.close()
|
||||
Path(self._stderr_file.name).unlink()
|
||||
except Exception:
|
||||
pass
|
||||
self._stderr_file = None
|
||||
|
||||
self._process = None
|
||||
self._stdout_stream = None
|
||||
self._stderr_stream = None
|
||||
|
|
@ -257,10 +273,6 @@ class SubprocessCLITransport(Transport):
|
|||
if not self._process or not self._stdout_stream:
|
||||
raise CLIConnectionError("Not connected")
|
||||
|
||||
# Safety constants
|
||||
max_stderr_size = 10 * 1024 * 1024 # 10MB
|
||||
stderr_timeout = 30.0 # 30 seconds
|
||||
|
||||
json_buffer = ""
|
||||
|
||||
# Process stdout messages first
|
||||
|
|
@ -318,36 +330,19 @@ class SubprocessCLITransport(Transport):
|
|||
# Client disconnected - still need to clean up
|
||||
pass
|
||||
|
||||
# Process stderr with safety limits
|
||||
stderr_lines = []
|
||||
stderr_size = 0
|
||||
|
||||
if self._stderr_stream:
|
||||
# Read stderr from temp file (keep only last N lines for memory efficiency)
|
||||
stderr_lines: deque[str] = deque(maxlen=100) # Keep last 100 lines
|
||||
if self._stderr_file:
|
||||
try:
|
||||
# Use timeout to prevent hanging
|
||||
with anyio.fail_after(stderr_timeout):
|
||||
async for line in self._stderr_stream:
|
||||
line_text = line.strip()
|
||||
line_size = len(line_text)
|
||||
|
||||
# Enforce memory limit
|
||||
if stderr_size + line_size > max_stderr_size:
|
||||
stderr_lines.append(
|
||||
f"[stderr truncated after {stderr_size} bytes]"
|
||||
)
|
||||
# Drain rest of stream without storing
|
||||
async for _ in self._stderr_stream:
|
||||
pass
|
||||
break
|
||||
|
||||
# Flush any pending writes
|
||||
self._stderr_file.flush()
|
||||
# Read from the beginning
|
||||
self._stderr_file.seek(0)
|
||||
for line in self._stderr_file:
|
||||
line_text = line.strip()
|
||||
if line_text:
|
||||
stderr_lines.append(line_text)
|
||||
stderr_size += line_size
|
||||
|
||||
except TimeoutError:
|
||||
stderr_lines.append(
|
||||
f"[stderr collection timed out after {stderr_timeout}s]"
|
||||
)
|
||||
except anyio.ClosedResourceError:
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Check process completion and handle errors
|
||||
|
|
@ -356,7 +351,13 @@ class SubprocessCLITransport(Transport):
|
|||
except Exception:
|
||||
returncode = -1
|
||||
|
||||
stderr_output = "\n".join(stderr_lines) if stderr_lines else ""
|
||||
# Convert deque to string for error reporting
|
||||
stderr_output = "\n".join(list(stderr_lines)) if stderr_lines else ""
|
||||
if len(stderr_lines) == stderr_lines.maxlen:
|
||||
stderr_output = (
|
||||
f"[stderr truncated, showing last {stderr_lines.maxlen} lines]\n"
|
||||
+ stderr_output
|
||||
)
|
||||
|
||||
# Use exit code for error detection, not string matching
|
||||
if returncode is not None and returncode != 0:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue