refactor(proxy): audit thinking-mode protocol and refactor test suite (#33)

2026-05-01 19:48:08 +08:00 · 2026-05-01 19:48:08 +08:00 · be0310751c
parent b65f0dd8a2
commit be0310751c
14 changed files with 2223 additions and 2894 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,3 +1,6 @@
+# AIs
+.claude/
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[codz]
--- a/README.md
+++ b/README.md
@ -134,7 +134,7 @@ Select `deepseek-v4-pro` in Cursor and use chat or agent mode as usual.

 ## How It Works

- **Core fix:** DeepSeek's [thinking mode](https://api-docs.deepseek.com/guides/thinking_mode#tool-calls) requires `reasoning_content` from assistant tool-call messages to be passed back in subsequent requests, but Cursor omits this field, causing a 400 error. The proxy (`Cursor → ngrok → proxy → DeepSeek API`) stores `reasoning_content` from every DeepSeek response in a local SQLite cache, keyed by message signature, tool-call ID, and tool-call function signature, and patches outgoing requests with missing `reasoning_content` before they reach DeepSeek. On a cold cache (proxy restart, model switch), it logs and drops unrecoverable history, continues from the latest user request, and prefixes the next Cursor response with a notice.
+- **Core fix:** DeepSeek [thinking-mode tool calls](https://api-docs.deepseek.com/guides/thinking_mode#tool-calls) require the complete **multi-round** `reasoning_content` chain to be sent back in later requests. Cursor omits that field, causing a 400 error. The proxy (`Cursor -> ngrok -> proxy -> DeepSeek API`) stores DeepSeek's original `reasoning_content` and patches missing blocks back into outgoing tool-call history.
 - **Multi-conversation isolation:** To avoid collisions across concurrent conversations, the proxy scopes cache keys by a SHA-256 hash of the canonical conversation prefix (roles, content, and tool calls, excluding `reasoning_content`) plus the upstream model, configuration, and an API-key hash. Different threads get different scopes, so reused tool-call IDs do not collide. Byte-identical cloned histories produce identical scopes.
 - **Context caching compatibility:** The proxy preserves compatibility by never injecting synthetic thread IDs, timestamps, or cache-control messages. It restores `reasoning_content` as the exact original string, so repeated prefixes remain intact for [DeepSeek context cache](https://api-docs.deepseek.com/guides/kv_cache). Cache hit rates are logged in the terminal output.
 - **Additional compatibility fixes:** Beyond reasoning repair, the proxy converts legacy `functions`/`function_call` fields to `tools`/`tool_choice`, preserves required and named tool-choice semantics, normalizes `reasoning_effort` aliases, strips mirrored thinking display blocks from assistant content, flattens multi-part content arrays to plain text, and mirrors `reasoning_content` into Cursor-visible Markdown details blocks.
--- a/src/deepseek_cursor_proxy/config.py
+++ b/src/deepseek_cursor_proxy/config.py
@ -172,8 +172,6 @@ def settings_from_config(

 def normalize_thinking(value: Any) -> str:
    thinking = as_str(value, DEFAULT_THINKING).strip().lower()
-    if thinking in {"passthrough", "pass-through", "pass_through"}:
-        return "pass-through"
    if thinking in {"enabled", "disabled"}:
        return thinking
    return DEFAULT_THINKING
--- a/src/deepseek_cursor_proxy/server.py
+++ b/src/deepseek_cursor_proxy/server.py
@ -540,6 +540,8 @@ class DeepSeekProxyHandler(BaseHTTPRequestHandler):
                scope=record_response_scope,
                prior_messages=record_response_messages,
                recording_contexts=record_response_contexts,
+                display_reasoning=self.config.display_reasoning,
+                collapsible_reasoning=self.config.collapsible_reasoning,
            )
        except (json.JSONDecodeError, UnicodeDecodeError) as exc:
            LOG.warning("failed to rewrite upstream JSON response: %s", exc)
@ -812,7 +814,7 @@ def build_arg_parser() -> argparse.ArgumentParser:
    )
    parser.add_argument(
        "--thinking",
-        choices=["enabled", "disabled", "pass-through"],
+        choices=["enabled", "disabled"],
        help="DeepSeek thinking mode, default from config or enabled",
    )
    parser.add_argument(
--- a/src/deepseek_cursor_proxy/streaming.py
+++ b/src/deepseek_cursor_proxy/streaming.py
@ -292,3 +292,34 @@ class CursorReasoningDisplayAdapter:
        }
        if metadata:
            self._last_chunk_metadata.update(metadata)
+
+
+def fold_reasoning_into_content(
+    response_payload: dict[str, Any],
+    collapsible: bool,
+) -> None:
+    """Mirror `reasoning_content` into the visible `content` field for
+    non-streaming responses, matching the streaming `<details>` layout.
+
+    The payload is modified in place and nothing is returned. For every
+    choice whose `message` carries a non-empty string `reasoning_content`,
+    the message's `content` is replaced by the thinking block (start marker,
+    reasoning text, end marker) followed by the original content. The
+    `reasoning_content` key itself is left on the message.
+
+    Args:
+        response_payload: Decoded chat-completion response body.
+        collapsible: Selects the COLLAPSIBLE_* block markers when true,
+            otherwise the plain THINKING_BLOCK_* markers.
+    """
+    block_start = (
+        COLLAPSIBLE_THINKING_BLOCK_START if collapsible else THINKING_BLOCK_START
+    )
+    block_end = COLLAPSIBLE_THINKING_BLOCK_END if collapsible else THINKING_BLOCK_END
+    choices = response_payload.get("choices")
+    # A payload without a list of choices is left untouched.
+    if not isinstance(choices, list):
+        return
+    for choice in choices:
+        # Malformed entries are skipped rather than raising.
+        if not isinstance(choice, dict):
+            continue
+        message = choice.get("message")
+        if not isinstance(message, dict):
+            continue
+        reasoning = message.get("reasoning_content")
+        # Nothing to mirror when reasoning is missing, empty, or not a string.
+        if not isinstance(reasoning, str) or not reasoning:
+            continue
+        content = message.get("content")
+        # Non-string content (for example None) contributes an empty suffix,
+        # so the visible content becomes the thinking block alone.
+        message["content"] = (
+            block_start
+            + reasoning
+            + block_end
+            + (content if isinstance(content, str) else "")
+        )
--- a/src/deepseek_cursor_proxy/transform.py
+++ b/src/deepseek_cursor_proxy/transform.py
@ -3,6 +3,7 @@ from __future__ import annotations
 from dataclasses import dataclass, field
 import hashlib
 import json
+import logging
 import re
 from typing import Any

@ -15,6 +16,10 @@ from .reasoning_store import (
    tool_call_signature,
    turn_context_signature,
 )
+from .streaming import fold_reasoning_into_content
+
+
+LOG = logging.getLogger("deepseek_cursor_proxy")


 SUPPORTED_REQUEST_FIELDS = {
@ -35,6 +40,13 @@ SUPPORTED_REQUEST_FIELDS = {
    "frequency_penalty",
    "logprobs",
    "top_logprobs",
+    # Standard OpenAI Chat Completions fields that DeepSeek either honors or
+    # safely ignores. Cursor and most OpenAI SDKs send these unconditionally,
+    # so forwarding keeps clients happy and avoids log spam.
+    "user",
+    "seed",
+    "n",
+    "logit_bias",
 }

 MESSAGE_FIELDS = {
@ -83,10 +95,6 @@ CURSOR_THINKING_BLOCK_RE = re.compile(
 )

 RECOVERY_NOTICE_TEXT = "[deepseek-cursor-proxy] Refreshed reasoning_content history."
-LEGACY_RECOVERY_NOTICE_TEXT = (
-    "Note: recovered this DeepSeek chat because older tool-call reasoning "
-    "was unavailable; continuing with recent context only."
-)
 RECOVERY_NOTICE_CONTENT = f"{RECOVERY_NOTICE_TEXT}\n\n"
 RECOVERY_SYSTEM_CONTENT = (
    "deepseek-cursor-proxy recovered this request because older DeepSeek "
@ -460,10 +468,33 @@ def has_recovery_notice(message: dict[str, Any]) -> bool:
    return (
        message.get("role") == "assistant"
        and isinstance(content, str)
-        and content.startswith((RECOVERY_NOTICE_TEXT, LEGACY_RECOVERY_NOTICE_TEXT))
+        and content.startswith(RECOVERY_NOTICE_TEXT)
    )


+def strip_recovery_notice_for_upstream(
+    messages: list[dict[str, Any]],
+) -> list[dict[str, Any]]:
+    """Cursor echoes the proxy's recovery notice back to us in later turns.
+    The notice serves as a boundary marker for the proxy, but DeepSeek must
+    not see proxy-generated prose. Return a copy with assistant prefixes
+    stripped; leave the input untouched so cache scopes/recording contexts
+    keep matching the with-prefix history that Cursor will send next time.
+
+    Args:
+        messages: Conversation history as a list of message dicts.
+
+    Returns:
+        A new list. Messages that need no change are the same dict objects
+        as in the input; each assistant message whose string content starts
+        with RECOVERY_NOTICE_TEXT is replaced by a shallow copy whose
+        content has that prefix and any following CR/LF characters removed.
+    """
+    stripped: list[dict[str, Any]] = []
+    for message in messages:
+        # Only assistant messages can carry the notice; pass others through.
+        if message.get("role") != "assistant":
+            stripped.append(message)
+            continue
+        content = message.get("content")
+        if not isinstance(content, str) or not content.startswith(RECOVERY_NOTICE_TEXT):
+            stripped.append(message)
+            continue
+        # Shallow-copy before editing so the caller's message dict is not mutated.
+        cleaned = dict(message)
+        # Drop the notice text, then the line break(s) that separated it
+        # from the real assistant content.
+        cleaned["content"] = content[len(RECOVERY_NOTICE_TEXT) :].lstrip("\r\n")
+        stripped.append(cleaned)
+    return stripped
+
+
 def leading_system_messages(messages: list[dict[str, Any]]) -> list[dict[str, Any]]:
    leading_messages: list[dict[str, Any]] = []
    for message in messages:
@ -628,6 +659,11 @@ def assistant_needs_reasoning_for_tool_context(
 def upstream_model_for(original_model: str, config: ProxyConfig) -> str:
    if original_model.startswith("deepseek-"):
        return original_model
+    LOG.warning(
+        "rewriting non-DeepSeek model %r to configured fallback %r",
+        original_model,
+        config.upstream_model,
+    )
    return config.upstream_model


@ -688,6 +724,16 @@ def prepare_upstream_request(
    prepared = {
        key: value for key, value in payload.items() if key in SUPPORTED_REQUEST_FIELDS
    }
+    dropped_fields = sorted(
+        key
+        for key in payload.keys()
+        if key not in SUPPORTED_REQUEST_FIELDS
+        and key not in {"max_completion_tokens", "functions", "function_call"}
+    )
+    if dropped_fields:
+        LOG.warning(
+            "dropping unsupported request field(s): %s", ", ".join(dropped_fields)
+        )
    if "max_tokens" not in prepared and "max_completion_tokens" in payload:
        prepared["max_tokens"] = payload["max_completion_tokens"]

@ -719,14 +765,9 @@ def prepare_upstream_request(
        if tool_choice is not None:
            prepared["tool_choice"] = tool_choice

-    if config.thinking != "pass-through":
-        prepared["thinking"] = {"type": config.thinking}
-
-    thinking = prepared.get("thinking")
-    thinking_enabled = isinstance(thinking, dict) and thinking.get("type") == "enabled"
-    thinking_disabled = (
-        isinstance(thinking, dict) and thinking.get("type") == "disabled"
-    )
+    prepared["thinking"] = {"type": config.thinking}
+    thinking_enabled = config.thinking == "enabled"
+    thinking_disabled = config.thinking == "disabled"
    if thinking_enabled:
        prepared["reasoning_effort"] = normalize_reasoning_effort(
            prepared.get("reasoning_effort") or config.reasoning_effort
@ -797,12 +838,12 @@ def prepare_upstream_request(
            keep_reasoning=not thinking_disabled,
        )
        reasoning_diagnostics.extend(latest_diagnostics)
-    prepared["messages"] = messages
    active_record_response_scope = conversation_scope(messages, cache_namespace)
    record_response_contexts = response_recording_contexts(
        (record_response_scope, record_response_messages),
        (active_record_response_scope, messages),
    )
+    prepared["messages"] = strip_recovery_notice_for_upstream(messages)

    return PreparedRequest(
        payload=prepared,
@ -874,6 +915,8 @@ def rewrite_response_body(
    scope: str | None = None,
    prior_messages: list[dict[str, Any]] | None = None,
    recording_contexts: list[tuple[str, list[dict[str, Any]]]] | None = None,
+    display_reasoning: bool = False,
+    collapsible_reasoning: bool = True,
 ) -> bytes:
    response_payload = json.loads(body.decode("utf-8"))
    if isinstance(response_payload, dict):
@ -888,6 +931,8 @@ def rewrite_response_body(
            prior_messages=prior_messages,
            recording_contexts=recording_contexts,
        )
+        if display_reasoning:
+            fold_reasoning_into_content(response_payload, collapsible_reasoning)
        if "model" in response_payload:
            response_payload["model"] = original_model
    return json.dumps(
--- a/tests/test_config.py
+++ b/tests/test_config.py
@ -121,7 +121,7 @@ class ConfigTests(unittest.TestCase):
                    [
                        "base_url: https://example.com/v1/",
                        "model: deepseek-v4-flash",
-                        "thinking: pass_through",
+                        "thinking: disabled",
                        "reasoning_effort: max",
                        "port: 9100",
                        "host: 0.0.0.0",
@ -145,7 +145,7 @@ class ConfigTests(unittest.TestCase):

        self.assertEqual(config.upstream_base_url, "https://example.com/v1")
        self.assertEqual(config.upstream_model, "deepseek-v4-flash")
-        self.assertEqual(config.thinking, "pass-through")
+        self.assertEqual(config.thinking, "disabled")
        self.assertEqual(config.reasoning_effort, "max")
        self.assertEqual(config.host, "0.0.0.0")
        self.assertEqual(config.port, 9100)
--- a/tests/test_live_deepseek_cursor_proxy.py
+++ b/tests/test_live_deepseek_cursor_proxy.py
--- a/tests/test_protocol.py
+++ b/tests/test_protocol.py
--- a/tests/test_proxy_end_to_end.py
+++ b/tests/test_proxy_end_to_end.py
--- a/tests/test_server.py
+++ b/tests/test_server.py
@ -1,24 +1,45 @@
+"""Server boundary, CLI, and operational tests.
+
+Pure helper tests (gzip, summarize) and stub-handler tests (client
+disconnect) live near the top. The bottom of the file boots a real proxy +
+tiny upstream to exercise things that need the HTTP layer: bearer token
+forwarding, oversized body, missing-bearer rejection, logging modes, and
+streaming connection close.
+"""
+
 from __future__ import annotations

+from dataclasses import replace
+from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
 from io import BytesIO
 import gzip
 import json
 from pathlib import Path
+import threading
+import time
 from types import SimpleNamespace
 import unittest
 import zlib
+from urllib.error import HTTPError
+from urllib.request import Request, urlopen

 from deepseek_cursor_proxy.config import ProxyConfig
 from deepseek_cursor_proxy.reasoning_store import ReasoningStore
 from deepseek_cursor_proxy.server import (
    DeepSeekProxyHandler,
+    DeepSeekProxyServer,
    build_arg_parser,
    read_response_body,
    summarize_chat_payload,
 )


-class FakeResponse:
+# ---------------------------------------------------------------------------
+# Stubs for fast in-process tests of internal handler methods
+# ---------------------------------------------------------------------------
+
+
+class _FakeResponse:
    def __init__(self, body: bytes, encoding: str = "", status: int = 200) -> None:
        self._body = BytesIO(body)
        self.headers = {"Content-Encoding": encoding} if encoding else {}
@ -28,7 +49,7 @@ class FakeResponse:
        return self._body.read()


-class FakeStreamingResponse:
+class _FakeStreamingResponse:
    status = 200
    headers = {"Content-Type": "text/event-stream"}

@ -43,7 +64,7 @@ class FakeStreamingResponse:
        return self._lines.pop(0)


-class FailingStreamingResponse:
+class _FailingStreamingResponse:
    status = 200
    headers = {"Content-Type": "text/event-stream"}

@ -51,7 +72,7 @@ class FailingStreamingResponse:
        raise OSError("record layer failure")


-class BrokenPipeWfile:
+class _BrokenPipeWfile:
    def write(self, body: bytes) -> None:
        raise BrokenPipeError("test disconnect")

@ -59,10 +80,10 @@ class BrokenPipeWfile:
        raise BrokenPipeError("test disconnect")


-def make_proxy_handler(wfile: object) -> DeepSeekProxyHandler:
+def _make_handler_stub(wfile: object, **config: object) -> DeepSeekProxyHandler:
    handler = object.__new__(DeepSeekProxyHandler)
    handler.server = SimpleNamespace(
-        config=ProxyConfig(),
+        config=ProxyConfig(**config),
        reasoning_store=ReasoningStore(":memory:"),
    )
    handler.wfile = wfile
@ -73,8 +94,13 @@ def make_proxy_handler(wfile: object) -> DeepSeekProxyHandler:
    return handler


-class ServerTests(unittest.TestCase):
-    def test_cli_boolean_overrides_have_on_and_off_forms(self) -> None:
+# ---------------------------------------------------------------------------
+# CLI / pure helpers
+# ---------------------------------------------------------------------------
+
+
+class CliAndHelperTests(unittest.TestCase):
+    def test_cli_boolean_flags_have_on_and_off_forms(self) -> None:
        args = build_arg_parser().parse_args(
            [
                "--no-ngrok",
@ -86,7 +112,6 @@ class ServerTests(unittest.TestCase):
                "/tmp/dcp-traces",
            ]
        )
-
        self.assertFalse(args.ngrok)
        self.assertFalse(args.verbose)
        self.assertFalse(args.display_reasoning)
@ -94,19 +119,17 @@ class ServerTests(unittest.TestCase):
        self.assertTrue(args.cors)
        self.assertEqual(args.trace_dir, Path("/tmp/dcp-traces"))

-    def test_read_response_body_handles_gzip(self) -> None:
-        body = gzip.compress(b'{"ok":true}')
-
-        self.assertEqual(read_response_body(FakeResponse(body, "gzip")), b'{"ok":true}')
-
-    def test_read_response_body_handles_deflate(self) -> None:
-        body = zlib.compress(b'{"ok":true}')
-
+    def test_read_response_body_decodes_gzip_and_deflate(self) -> None:
        self.assertEqual(
-            read_response_body(FakeResponse(body, "deflate")), b'{"ok":true}'
+            read_response_body(_FakeResponse(gzip.compress(b'{"ok":1}'), "gzip")),
+            b'{"ok":1}',
+        )
+        self.assertEqual(
+            read_response_body(_FakeResponse(zlib.compress(b'{"ok":1}'), "deflate")),
+            b'{"ok":1}',
        )

-    def test_summarize_chat_payload_does_not_include_message_content(self) -> None:
+    def test_summarize_chat_payload_omits_message_content(self) -> None:
        summary = summarize_chat_payload(
            {
                "model": "deepseek-v4-pro",
@ -116,18 +139,22 @@ class ServerTests(unittest.TestCase):
                "tool_choice": "auto",
            }
        )
-
        self.assertIn("model='deepseek-v4-pro'", summary)
-        self.assertIn("stream=True", summary)
        self.assertIn("messages=1", summary)
-        self.assertIn("tools=1", summary)
        self.assertNotIn("secret prompt", summary)

+
+# ---------------------------------------------------------------------------
+# Client-disconnect / upstream-failure stubs (no real HTTP needed)
+# ---------------------------------------------------------------------------
+
+
+class HandlerStubTests(unittest.TestCase):
    def test_regular_response_handles_client_disconnect(self) -> None:
-        handler = make_proxy_handler(BrokenPipeWfile())
+        handler = _make_handler_stub(_BrokenPipeWfile())
        body = json.dumps(
            {
-                "id": "chatcmpl-test",
+                "id": "x",
                "object": "chat.completion",
                "model": "deepseek-v4-pro",
                "choices": [
@ -139,116 +166,324 @@ class ServerTests(unittest.TestCase):
                ],
            }
        ).encode("utf-8")
-
        try:
            with self.assertLogs("deepseek_cursor_proxy", level="WARNING") as captured:
-                sent = handler._proxy_regular_response(
-                    FakeResponse(body),
+                result = handler._proxy_regular_response(
+                    _FakeResponse(body),
                    "deepseek-v4-pro",
                    [{"role": "user", "content": "hi"}],
-                    "cache-namespace",
+                    "ns",
                )
        finally:
            handler.server.reasoning_store.close()
-
-        self.assertFalse(sent.sent)
+        self.assertFalse(result.sent)
        self.assertIn("sending upstream response body", "\n".join(captured.output))

    def test_streaming_response_stops_on_client_disconnect(self) -> None:
-        handler = make_proxy_handler(BrokenPipeWfile())
+        handler = _make_handler_stub(_BrokenPipeWfile())
        chunk = {
-            "id": "chatcmpl-stream",
+            "id": "stream",
            "model": "deepseek-v4-pro",
-            "choices": [
-                {
-                    "index": 0,
-                    "delta": {"role": "assistant", "content": "hello"},
-                }
-            ],
+            "choices": [{"index": 0, "delta": {"role": "assistant", "content": "hi"}}],
        }
-        response = FakeStreamingResponse(
+        response = _FakeStreamingResponse(
            [
                f"data: {json.dumps(chunk)}\n\n".encode("utf-8"),
                b"data: [DONE]\n\n",
            ]
        )
-
        try:
            with self.assertLogs("deepseek_cursor_proxy", level="WARNING") as captured:
-                sent = handler._proxy_streaming_response(
+                result = handler._proxy_streaming_response(
                    response,
                    "deepseek-v4-pro",
                    [{"role": "user", "content": "hi"}],
-                    "cache-namespace",
+                    "ns",
                )
        finally:
            handler.server.reasoning_store.close()
-
-        self.assertFalse(sent.sent)
+        self.assertFalse(result.sent)
        self.assertEqual(response.readline_calls, 1)
        self.assertIn("sending streaming response chunk", "\n".join(captured.output))

    def test_streaming_response_handles_upstream_read_failure(self) -> None:
-        handler = make_proxy_handler(BytesIO())
-
+        handler = _make_handler_stub(BytesIO())
        try:
            with self.assertLogs("deepseek_cursor_proxy", level="WARNING") as captured:
-                sent = handler._proxy_streaming_response(
-                    FailingStreamingResponse(),
+                result = handler._proxy_streaming_response(
+                    _FailingStreamingResponse(),
                    "deepseek-v4-pro",
                    [{"role": "user", "content": "hi"}],
-                    "cache-namespace",
+                    "ns",
                )
        finally:
            handler.server.reasoning_store.close()
-
-        self.assertFalse(sent.sent)
+        self.assertFalse(result.sent)
        self.assertIn(
-            "upstream streaming response read failed",
-            "\n".join(captured.output),
+            "upstream streaming response read failed", "\n".join(captured.output)
        )

-    def test_collapsible_reasoning_has_no_effect_when_display_is_disabled(
-        self,
-    ) -> None:
+    def test_collapsible_reasoning_no_effect_when_display_disabled(self) -> None:
        wfile = BytesIO()
-        handler = make_proxy_handler(wfile)
-        handler.server.config = ProxyConfig(
-            display_reasoning=False,
-            collapsible_reasoning=True,
+        handler = _make_handler_stub(
+            wfile, display_reasoning=False, collapsible_reasoning=True
        )
        chunk = {
-            "id": "chatcmpl-stream",
+            "id": "stream",
            "model": "deepseek-v4-pro",
-            "choices": [
-                {
-                    "index": 0,
-                    "delta": {"reasoning_content": "Need context."},
-                }
-            ],
+            "choices": [{"index": 0, "delta": {"reasoning_content": "Need context."}}],
        }
-        response = FakeStreamingResponse(
+        response = _FakeStreamingResponse(
            [
                f"data: {json.dumps(chunk)}\n\n".encode("utf-8"),
                b"data: [DONE]\n\n",
            ]
        )
-
        try:
-            sent = handler._proxy_streaming_response(
+            handler._proxy_streaming_response(
                response,
                "deepseek-v4-pro",
                [{"role": "user", "content": "hi"}],
-                "cache-namespace",
+                "ns",
            )
        finally:
            handler.server.reasoning_store.close()
-
        body = wfile.getvalue().decode("utf-8")
-        self.assertTrue(sent.sent)
        self.assertIn("reasoning_content", body)
        self.assertNotIn("<details>", body)
-        self.assertNotIn("<think>", body)
+
+
+# ---------------------------------------------------------------------------
+# HTTP-level boundary tests: real proxy + tiny upstream
+# ---------------------------------------------------------------------------
+
+
+class _PlainFakeUpstream(BaseHTTPRequestHandler):
+    """Returns a fixed plain response and records every request.
+
+    All state lives on the class (not on handler instances), so tests read
+    and reset it through `_PlainFakeUpstream.<attr>`.
+    """
+
+    # Decoded JSON body of every POST received, in arrival order.
+    requests: list[dict[str, object]] = []
+    # `Authorization` header of every POST ("" when the header is absent).
+    auth_headers: list[str] = []
+    # Seconds to sleep after writing `[DONE]` on a streaming response;
+    # 0.0 disables the sleep.
+    delay_after_done: float = 0.0
+    # JSON object returned for non-streaming requests.
+    response: dict[str, object] = {}
+
+    def log_message(self, fmt: str, *args: object) -> None:
+        # Override the base handler's per-request logging with a no-op.
+        return
+
+    def do_POST(self) -> None:
+        """Record the request, then answer with a canned SSE or JSON body."""
+        length = int(self.headers.get("Content-Length") or 0)
+        payload = json.loads(self.rfile.read(length).decode("utf-8"))
+        self.__class__.requests.append(payload)
+        self.__class__.auth_headers.append(self.headers.get("Authorization", ""))
+
+        if payload.get("stream"):
+            # Streaming path: one content chunk followed by the [DONE] marker.
+            # No Content-Length is sent on this path.
+            self.send_response(200)
+            self.send_header("Content-Type", "text/event-stream")
+            self.end_headers()
+            self.wfile.write(
+                b'data: {"choices":[{"index":0,"delta":{"content":"x"}}]}\n\n'
+            )
+            self.wfile.write(b"data: [DONE]\n\n")
+            self.wfile.flush()
+            # Optionally delay returning from the handler after [DONE] has
+            # been flushed.
+            if self.__class__.delay_after_done:
+                time.sleep(self.__class__.delay_after_done)
+            return
+
+        # Non-streaming path: serialize the configured response with an
+        # explicit Content-Length.
+        body = json.dumps(self.__class__.response).encode("utf-8")
+        self.send_response(200)
+        self.send_header("Content-Type", "application/json")
+        self.send_header("Content-Length", str(len(body)))
+        self.end_headers()
+        self.wfile.write(body)
+
+# Canned non-streaming chat-completion body. HttpBoundaryTests.setUp installs
+# a shallow copy of it as the fake upstream's response.
+_BASE_RESPONSE: dict[str, object] = {
+    "id": "x",
+    "object": "chat.completion",
+    "created": 1,
+    "model": "deepseek-v4-pro",
+    "choices": [
+        {
+            "index": 0,
+            "finish_reason": "stop",
+            "message": {"role": "assistant", "content": "ok"},
+        }
+    ],
+    "usage": {
+        "prompt_tokens": 20,
+        "completion_tokens": 5,
+        "total_tokens": 25,
+        "prompt_cache_hit_tokens": 12,
+        "prompt_cache_miss_tokens": 8,
+        "completion_tokens_details": {"reasoning_tokens": 3},
+    },
+}
+
+
+class _Fixture:
+    """Runs an HTTP server on a background daemon thread for one test.
+
+    The thread is started in the constructor; call `close()` to stop the
+    server and release its socket.
+    """
+
+    def __init__(self, server: ThreadingHTTPServer) -> None:
+        self.server = server
+        # Daemon thread, so a test that fails before close() cannot keep the
+        # interpreter alive.
+        self.thread = threading.Thread(target=server.serve_forever, daemon=True)
+        self.thread.start()
+
+    @property
+    def url(self) -> str:
+        """Base `http://host:port` URL built from the server's bound address."""
+        host, port = self.server.server_address
+        return f"http://{host}:{port}"
+
+    def close(self) -> None:
+        """Stop serving, close the listening socket, and join the thread."""
+        self.server.shutdown()
+        self.server.server_close()
+        # Bounded join: wait at most 5 seconds for the serving thread.
+        self.thread.join(timeout=5)
+
+
+def _post(url: str, payload: dict, api_key: str = "sk-test") -> tuple[int, dict]:
+    """POST `payload` as JSON with a Bearer token and return the decoded reply.
+
+    Args:
+        url: Full endpoint URL.
+        payload: Request body, serialized with `json.dumps`.
+        api_key: Value placed after `Bearer ` in the Authorization header.
+
+    Returns:
+        `(status, body)` where `body` is the JSON-decoded response. HTTP
+        error statuses are returned the same way instead of raising; the
+        body is parsed as JSON in both cases.
+    """
+    request = Request(
+        url,
+        data=json.dumps(payload).encode("utf-8"),
+        method="POST",
+        headers={
+            "Authorization": f"Bearer {api_key}",
+            "Content-Type": "application/json",
+        },
+    )
+    try:
+        with urlopen(request, timeout=5) as response:
+            return response.status, json.loads(response.read().decode("utf-8"))
+    except HTTPError as exc:
+        # urlopen raises on error statuses; convert that into a normal
+        # (code, body) result so callers can assert on both.
+        return exc.code, json.loads(exc.read().decode("utf-8"))
+
+
+class HttpBoundaryTests(unittest.TestCase):
+    """Real-HTTP tests that don't fit the protocol suite: things the proxy
+    must do at the HTTP boundary regardless of what DeepSeek answers."""
+
+    def setUp(self) -> None:
+        # Reset the class-level recording state so tests do not see each
+        # other's requests.
+        _PlainFakeUpstream.requests = []
+        _PlainFakeUpstream.auth_headers = []
+        _PlainFakeUpstream.delay_after_done = 0.0
+        _PlainFakeUpstream.response = dict(_BASE_RESPONSE)
+        # Port 0 asks the OS for a free port; the real address is read back
+        # through _Fixture.url.
+        self.upstream = _Fixture(
+            ThreadingHTTPServer(("127.0.0.1", 0), _PlainFakeUpstream)
+        )
+        self.store = ReasoningStore(":memory:")
+        proxy = DeepSeekProxyServer(("127.0.0.1", 0), DeepSeekProxyHandler)
+        # Point the proxy at the fake upstream.
+        proxy.config = ProxyConfig(
+            upstream_base_url=self.upstream.url,
+            upstream_model="deepseek-v4-pro",
+            ngrok=False,
+        )
+        proxy.reasoning_store = self.store
+        self.proxy = _Fixture(proxy)
+
+    def tearDown(self) -> None:
+        # Stop the proxy before the upstream it forwards to, then the store.
+        self.proxy.close()
+        self.upstream.close()
+        self.store.close()
+
+    def _request(self) -> dict:
+        """Return a minimal non-streaming chat request body."""
+        return {
+            "model": "deepseek-v4-pro",
+            "messages": [{"role": "user", "content": "hi"}],
+        }
+
+    def test_rejects_missing_bearer_token(self) -> None:
+        # Built by hand (not via _post) so that no Authorization header is sent.
+        request = Request(
+            f"{self.proxy.url}/v1/chat/completions",
+            data=json.dumps(self._request()).encode("utf-8"),
+            method="POST",
+            headers={"Content-Type": "application/json"},
+        )
+        with self.assertRaises(HTTPError) as caught:
+            urlopen(request, timeout=5)
+        self.assertEqual(caught.exception.code, 401)
+        # The request must be rejected before anything reaches the upstream.
+        self.assertEqual(_PlainFakeUpstream.requests, [])
+
+    def test_rejects_oversized_request_body(self) -> None:
+        # Shrink the limit to 10 bytes so the ordinary request body exceeds it.
+        self.proxy.server.config = replace(
+            self.proxy.server.config, max_request_body_bytes=10
+        )
+        status, payload = _post(
+            f"{self.proxy.url}/v1/chat/completions", self._request()
+        )
+        self.assertEqual(status, 413)
+        self.assertIn("too large", payload["error"]["message"])
+        self.assertEqual(_PlainFakeUpstream.requests, [])
+
+    def test_forwards_bearer_token_to_upstream(self) -> None:
+        status, _ = _post(
+            f"{self.proxy.url}/v1/chat/completions",
+            self._request(),
+            api_key="sk-from-cursor",
+        )
+        self.assertEqual(status, 200)
+        # The upstream must receive the client's Authorization header unchanged.
+        self.assertEqual(_PlainFakeUpstream.auth_headers[0], "Bearer sk-from-cursor")
+
+    def test_streaming_response_closes_after_done_when_upstream_lingers(
+        self,
+    ) -> None:
+        """Cursor relies on the proxy ending the SSE stream at [DONE], even
+        if the upstream socket stays open."""
+        # The upstream sleeps 2 s after [DONE]; the client below allows 1 s.
+        _PlainFakeUpstream.delay_after_done = 2.0
+        request = Request(
+            f"{self.proxy.url}/v1/chat/completions",
+            data=json.dumps(
+                {
+                    "model": "deepseek-v4-pro",
+                    "stream": True,
+                    "messages": [{"role": "user", "content": "stream"}],
+                }
+            ).encode("utf-8"),
+            method="POST",
+            headers={
+                "Authorization": "Bearer sk-test",
+                "Content-Type": "application/json",
+            },
+        )
+        started = time.monotonic()
+        with urlopen(request, timeout=1) as response:
+            body = response.read().decode("utf-8")
+        # Reading to end-of-stream must finish well before the upstream's
+        # 2 s linger ends.
+        self.assertLess(time.monotonic() - started, 1.0)
+        self.assertIn("data: [DONE]", body)
+
+    def test_normal_logging_summarizes_without_bodies_or_keys(self) -> None:
+        with self.assertLogs("deepseek_cursor_proxy", level="INFO") as captured:
+            status, _ = _post(
+                f"{self.proxy.url}/v1/chat/completions",
+                self._request(),
+                api_key="sk-from-cursor",
+            )
+            # `└ stats` is emitted on the handler thread *after* the response
+            # body hits the socket, so the client may return before it lands.
+            deadline = time.monotonic() + 2
+            while time.monotonic() < deadline and not any(
+                "└ stats" in record for record in captured.output
+            ):
+                time.sleep(0.01)
+        output = "\n".join(captured.output)
+        self.assertEqual(status, 200)
+        # Single-line stage records keep the log readable.
+        for marker in ("┌ cursor", "├ context", "├ send", "└ stats"):
+            self.assertIn(marker, output)
+        # The rest of the first `┌ cursor` line must not contain the user
+        # prompt text, and the API key must not appear anywhere.
+        self.assertNotIn("hi", output.split("┌ cursor")[1].split("\n")[0])
+        self.assertNotIn("sk-from-cursor", output)
+
+    def test_verbose_logging_includes_bodies_but_redacts_api_key(self) -> None:
+        # Swap in a verbose copy of the config on the running server.
+        self.proxy.server.config = replace(self.proxy.server.config, verbose=True)
+        with self.assertLogs("deepseek_cursor_proxy", level="INFO") as captured:
+            _post(
+                f"{self.proxy.url}/v1/chat/completions",
+                self._request(),
+                api_key="sk-from-cursor",
+            )
+        output = "\n".join(captured.output)
+        self.assertIn("cursor request body", output)
+        self.assertIn("upstream request body", output)
+        # Even with bodies logged, the bearer token must stay out of the log.
+        self.assertNotIn("sk-from-cursor", output)
+
+    def test_healthz_returns_ok(self) -> None:
+        # /healthz is fetched here without an Authorization header.
+        with urlopen(f"{self.proxy.url}/healthz", timeout=2) as response:
+            self.assertEqual(response.status, 200)
+            self.assertEqual(json.loads(response.read())["ok"], True)


 if __name__ == "__main__":
--- a/tests/test_streaming.py
+++ b/tests/test_streaming.py
@ -6,6 +6,7 @@ from deepseek_cursor_proxy.reasoning_store import ReasoningStore, conversation_s
 from deepseek_cursor_proxy.streaming import (
    CursorReasoningDisplayAdapter,
    StreamAccumulator,
+    fold_reasoning_into_content,
 )


@ -430,5 +431,44 @@ class CursorReasoningDisplayAdapterTests(unittest.TestCase):
        self.assertIsNone(adapter.flush_chunk("deepseek-v4-pro"))


+class FoldReasoningTests(unittest.TestCase):
+    """Exercises fold_reasoning_into_content on non-streaming payloads."""
+
+    @staticmethod
+    def _completion(reasoning: str) -> dict:
+        """Build a one-choice completion whose content is "answer" and whose
+        reasoning_content is *reasoning*."""
+        message = {
+            "role": "assistant",
+            "content": "answer",
+            "reasoning_content": reasoning,
+        }
+        return {"choices": [{"index": 0, "message": message}]}
+
+    def test_fold_reasoning_into_non_streaming_content(self) -> None:
+        """Non-streaming responses mirror reasoning_content into a visible
+        <details> block, matching the streaming layout."""
+        payload = self._completion("thinking")
+        fold_reasoning_into_content(payload, collapsible=True)
+        expected = (
+            "<details>\n<summary>Thinking</summary>\n\n"
+            "thinking\n</details>\n\n"
+            "answer"
+        )
+        self.assertEqual(payload["choices"][0]["message"]["content"], expected)
+
+    def test_fold_reasoning_skips_empty_reasoning(self) -> None:
+        """An empty reasoning_content leaves the content as it was."""
+        payload = self._completion("")
+        fold_reasoning_into_content(payload, collapsible=True)
+        self.assertEqual(payload["choices"][0]["message"]["content"], "answer")
+
+
 if __name__ == "__main__":
    unittest.main()
--- a/tests/test_trace.py
+++ b/tests/test_trace.py
@ -1,14 +1,25 @@
+"""Trace writer tests, both as a unit (writes/redacts files) and integrated
+through the proxy (captures real request flow on disk)."""
+
 from __future__ import annotations

+from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
 import json
+from pathlib import Path
 import stat
+import threading
 from tempfile import TemporaryDirectory
+import time
 import unittest
+from urllib.request import Request, urlopen

+from deepseek_cursor_proxy.config import ProxyConfig
+from deepseek_cursor_proxy.reasoning_store import ReasoningStore
+from deepseek_cursor_proxy.server import DeepSeekProxyHandler, DeepSeekProxyServer
 from deepseek_cursor_proxy.trace import TraceWriter


-class TraceWriterTests(unittest.TestCase):
+class TraceWriterUnitTests(unittest.TestCase):
    def test_writes_manifest_and_numbered_request_files(self) -> None:
        with TemporaryDirectory() as temp_dir:
            writer = TraceWriter(temp_dir)
@ -47,17 +58,244 @@ class TraceWriterTests(unittest.TestCase):
                headers={"Authorization": "Bearer sk-secret"},
            )
            trace.finish("completed", http_status=200)
-
-            payload = json.loads(trace.path.read_text(encoding="utf-8"))
-            serialized = json.dumps(payload)
-
+            serialized = trace.path.read_text(encoding="utf-8")
            self.assertNotIn("sk-secret", serialized)
+            payload = json.loads(serialized)
            self.assertEqual(
-                payload["request"]["headers"]["Authorization"]["present"],
-                True,
+                payload["request"]["headers"]["Authorization"]["present"], True
            )
            self.assertIn("sha256", payload["request"]["headers"]["Authorization"])


+# ---------------------------------------------------------------------------
+# Integration: trace writer attached to a running proxy.
+# ---------------------------------------------------------------------------
+
+
+class _CannedUpstream(BaseHTTPRequestHandler):
+    """Stand-in upstream that answers every POST with a canned reply.
+
+    A request whose JSON body has a truthy ``stream`` receives a short SSE
+    stream (reasoning delta, content delta, stop, ``[DONE]``); any other
+    request receives one JSON completion carrying ``reasoning_content`` and a
+    ``get_date`` tool call. Each decoded request body is appended to the
+    class-level ``requests`` list.
+    """
+
+    requests: list[dict[str, object]] = []
+
+    # SSE events written, in this order, for a streaming request.
+    _STREAM_EVENTS = (
+        b'data: {"id":"s","object":"chat.completion.chunk","choices":'
+        b'[{"index":0,"delta":{"role":"assistant","reasoning_content":"think"},'
+        b'"finish_reason":null}]}\n\n',
+        b'data: {"id":"s","object":"chat.completion.chunk","choices":'
+        b'[{"index":0,"delta":{"content":"answer"},"finish_reason":null}],'
+        b'"usage":{"completion_tokens_details":{"reasoning_tokens":1}}}\n\n',
+        b'data: {"id":"s","object":"chat.completion.chunk",'
+        b'"choices":[{"index":0,"delta":{},"finish_reason":"stop"}]}\n\n',
+        b"data: [DONE]\n\n",
+    )
+
+    # Completion returned, JSON-encoded, for a non-streaming request.
+    _TOOL_CALL_COMPLETION = {
+        "id": "tool",
+        "object": "chat.completion",
+        "model": "deepseek-v4-pro",
+        "choices": [
+            {
+                "index": 0,
+                "finish_reason": "tool_calls",
+                "message": {
+                    "role": "assistant",
+                    "content": "",
+                    "reasoning_content": "I need the date.",
+                    "tool_calls": [
+                        {
+                            "id": "call_date",
+                            "type": "function",
+                            "function": {
+                                "name": "get_date",
+                                "arguments": "{}",
+                            },
+                        }
+                    ],
+                },
+            }
+        ],
+    }
+
+    def log_message(self, fmt: str, *args: object) -> None:
+        """Drop the handler's per-request log output."""
+        return
+
+    def do_POST(self) -> None:
+        """Record the request body, then send the reply its ``stream`` flag
+        selects."""
+        size = int(self.headers.get("Content-Length") or 0)
+        payload = json.loads(self.rfile.read(size).decode("utf-8"))
+        type(self).requests.append(payload)
+        if payload.get("stream"):
+            self._reply_with_stream()
+        else:
+            self._reply_with_tool_call()
+
+    def _reply_with_stream(self) -> None:
+        """Send the canned SSE events one write at a time, then flush."""
+        self.send_response(200)
+        self.send_header("Content-Type", "text/event-stream")
+        self.end_headers()
+        for event in self._STREAM_EVENTS:
+            self.wfile.write(event)
+        self.wfile.flush()
+
+    def _reply_with_tool_call(self) -> None:
+        """Send the canned tool-call completion as a sized JSON body."""
+        body = json.dumps(self._TOOL_CALL_COMPLETION).encode("utf-8")
+        self.send_response(200)
+        self.send_header("Content-Type", "application/json")
+        self.send_header("Content-Length", str(len(body)))
+        self.end_headers()
+        self.wfile.write(body)
+
+
+class _Fixture:
+    """Serves an HTTP server from a background daemon thread.
+
+    The thread starts in the constructor; ``close`` stops the serve loop,
+    releases the listening socket and waits up to five seconds for the thread.
+    """
+
+    def __init__(self, server: ThreadingHTTPServer) -> None:
+        self.server = server
+        worker = threading.Thread(target=server.serve_forever, daemon=True)
+        self.thread = worker
+        worker.start()
+
+    @property
+    def url(self) -> str:
+        """Base ``http://host:port`` URL taken from the bound address."""
+        host, port = self.server.server_address
+        return "http://{}:{}".format(host, port)
+
+    def close(self) -> None:
+        """Shut the server down and join its thread."""
+        server = self.server
+        server.shutdown()
+        server.server_close()
+        self.thread.join(timeout=5)
+
+
+def _read_single_trace(session_dir: Path) -> dict:
+    """Return the parsed JSON of the only ``request-*.json`` in *session_dir*.
+
+    Polls every 10 ms, for at most two seconds, until at least one matching
+    file exists. Raises AssertionError unless exactly one file is then found.
+    """
+    give_up_at = time.monotonic() + 2
+    while True:
+        traces = sorted(session_dir.glob("request-*.json"))
+        if traces or time.monotonic() >= give_up_at:
+            break
+        time.sleep(0.01)
+    if len(traces) != 1:
+        raise AssertionError(f"expected one trace, found {traces}")
+    (trace_path,) = traces
+    return json.loads(trace_path.read_text(encoding="utf-8"))
+
+
+class TraceIntegrationTests(unittest.TestCase):
+    """Sends real requests through a proxy wired to a TraceWriter and the
+    canned upstream, then inspects the trace file written to disk."""
+
+    def setUp(self) -> None:
+        """Start the canned upstream and a proxy that traces into a fresh
+        temporary directory."""
+        _CannedUpstream.requests = []
+        # Register each cleanup as soon as its resource exists: unittest skips
+        # tearDown() when setUp() raises but always runs registered cleanups,
+        # so a failure part-way through cannot leak threads or sockets.
+        # Cleanups run last-in-first-out, so the proxy stops first, then the
+        # upstream, then the store, and the temp directory is removed last.
+        self.temp_dir = TemporaryDirectory()
+        self.addCleanup(self.temp_dir.cleanup)
+        self.store = ReasoningStore(":memory:")
+        self.addCleanup(self.store.close)
+        self.upstream = _Fixture(ThreadingHTTPServer(("127.0.0.1", 0), _CannedUpstream))
+        self.addCleanup(self.upstream.close)
+        self.writer = TraceWriter(self.temp_dir.name)
+        proxy = DeepSeekProxyServer(("127.0.0.1", 0), DeepSeekProxyHandler)
+        proxy.config = ProxyConfig(
+            upstream_base_url=self.upstream.url,
+            upstream_model="deepseek-v4-pro",
+            ngrok=False,
+        )
+        proxy.reasoning_store = self.store
+        proxy.trace_writer = self.writer
+        self.proxy = _Fixture(proxy)
+        self.addCleanup(self.proxy.close)
+
+    def _build_request(
+        self, payload: dict, *, api_key: str = "sk-from-cursor"
+    ) -> Request:
+        """Build a chat-completions POST carrying *payload* as JSON and
+        *api_key* as the bearer token."""
+        return Request(
+            f"{self.proxy.url}/v1/chat/completions",
+            data=json.dumps(payload).encode("utf-8"),
+            method="POST",
+            headers={
+                "Authorization": f"Bearer {api_key}",
+                "Content-Type": "application/json",
+            },
+        )
+
+    def _post(self, payload: dict) -> dict:
+        """POST *payload* to the proxy and return the decoded JSON reply."""
+        with urlopen(self._build_request(payload), timeout=5) as response:
+            return json.loads(response.read())
+
+    def test_captures_non_streaming_replay_without_api_key(self) -> None:
+        """The trace records the request body and the upstream reasoning but
+        never the caller's API key."""
+        self._post(
+            {
+                "model": "deepseek-v4-pro",
+                "messages": [{"role": "user", "content": "What is tomorrow's date?"}],
+            }
+        )
+        trace = _read_single_trace(self.writer.session_dir)
+        serialized = json.dumps(trace)
+        self.assertEqual(trace["completion"]["status"], "completed")
+        self.assertEqual(
+            trace["request"]["body"]["messages"][0]["content"],
+            "What is tomorrow's date?",
+        )
+        upstream_message = trace["upstream"]["response"]["body"]["json"]["choices"][
+            0
+        ]["message"]
+        self.assertEqual(upstream_message["reasoning_content"], "I need the date.")
+        self.assertNotIn("sk-from-cursor", serialized)
+
+    def test_captures_streaming_replay_chunks(self) -> None:
+        """A streamed request records the upstream chunks and the chunks sent
+        back to the client."""
+        request = self._build_request(
+            {
+                "model": "deepseek-v4-pro",
+                "stream": True,
+                "messages": [{"role": "user", "content": "stream"}],
+            },
+            api_key="sk-test",
+        )
+        # The reply is an event stream, so drain it without JSON-decoding.
+        with urlopen(request, timeout=2) as response:
+            response.read()
+        trace = _read_single_trace(self.writer.session_dir)
+        self.assertEqual(trace["completion"]["status"], "completed")
+        self.assertIn(
+            "reasoning_content",
+            trace["upstream"]["stream"]["chunks"][0]["line"],
+        )
+        self.assertIn(
+            "<details>", trace["cursor_response"]["stream"]["chunks"][0]["line"]
+        )
+
+    def test_captures_recovery_diagnostics(self) -> None:
+        """A request that triggers cold-cache recovery records the recovery
+        steps + diagnostic counters in the trace."""
+        self._post(
+            {
+                "model": "deepseek-v4-pro",
+                "messages": [
+                    {"role": "user", "content": "old"},
+                    {
+                        "role": "assistant",
+                        "content": "",
+                        "tool_calls": [
+                            {
+                                "id": "call_x",
+                                "type": "function",
+                                "function": {"name": "f", "arguments": "{}"},
+                            }
+                        ],
+                    },
+                    {"role": "tool", "tool_call_id": "call_x", "content": "result"},
+                    {"role": "user", "content": "new"},
+                ],
+            }
+        )
+        trace = _read_single_trace(self.writer.session_dir)
+        self.assertEqual(
+            trace["transform"]["recovery_steps"][0]["strategy"], "latest_user"
+        )
+        missing_items = [
+            item
+            for item in trace["transform"]["reasoning_diagnostics"]
+            if item["missing"]
+        ]
+        self.assertGreaterEqual(len(missing_items), 1)
+
+
 if __name__ == "__main__":
    unittest.main()
--- a/tests/test_transform.py
+++ b/tests/test_transform.py