From 16d503c4d6ef41b9e50471da46a417dc9731f4f9 Mon Sep 17 00:00:00 2001
From: Wen-Tien Chang
Date: Wed, 26 Nov 2025 18:51:15 +0800
Subject: [PATCH] Fix: preserve usage from earlier stream chunks when later chunks have none

---
 src/agents/models/chatcmpl_stream_handler.py |  4 +-
 tests/test_reasoning_content.py              | 54 ++++++++++++++++++++
 2 files changed, 57 insertions(+), 1 deletion(-)

diff --git a/src/agents/models/chatcmpl_stream_handler.py b/src/agents/models/chatcmpl_stream_handler.py
index 94dd10205..f1c504977 100644
--- a/src/agents/models/chatcmpl_stream_handler.py
+++ b/src/agents/models/chatcmpl_stream_handler.py
@@ -97,7 +97,9 @@ async def handle_stream(
             )
 
             # This is always set by the OpenAI API, but not by others e.g. LiteLLM
-            usage = chunk.usage if hasattr(chunk, "usage") else None
+            # Only update when the chunk has usage data (the last chunk may have none)
+            if hasattr(chunk, "usage") and chunk.usage is not None:
+                usage = chunk.usage
 
             if not chunk.choices or not chunk.choices[0].delta:
                 continue
diff --git a/tests/test_reasoning_content.py b/tests/test_reasoning_content.py
index a64fdaf15..b9d7fa91f 100644
--- a/tests/test_reasoning_content.py
+++ b/tests/test_reasoning_content.py
@@ -234,6 +234,60 @@ async def patched_fetch_response(self, *args, **kwargs):
     assert resp.output[1].content[0].text == "The answer is 42"
 
 
+@pytest.mark.allow_call_model_methods
+@pytest.mark.asyncio
+async def test_stream_response_preserves_usage_from_earlier_chunk(monkeypatch) -> None:
+    """
+    Test that when an earlier chunk has usage data and later chunks don't,
+    the usage from the earlier chunk is preserved in the final response.
+    This handles cases where some providers (e.g., LiteLLM) may not include
+    usage in every chunk.
+    """
+    # Create test chunks where the first chunk has usage and the last one doesn't
+    chunks = [
+        create_chunk(create_content_delta("Hello"), include_usage=True),  # Has usage
+        create_chunk(create_content_delta("")),  # No usage (usage=None)
+    ]
+
+    async def patched_fetch_response(self, *args, **kwargs):
+        resp = Response(
+            id="resp-id",
+            created_at=0,
+            model="fake-model",
+            object="response",
+            output=[],
+            tool_choice="none",
+            tools=[],
+            parallel_tool_calls=False,
+        )
+        return resp, create_fake_stream(chunks)
+
+    monkeypatch.setattr(OpenAIChatCompletionsModel, "_fetch_response", patched_fetch_response)
+    model = OpenAIProvider(use_responses=False).get_model("gpt-4")
+    output_events = []
+    async for event in model.stream_response(
+        system_instructions=None,
+        input="",
+        model_settings=ModelSettings(),
+        tools=[],
+        output_schema=None,
+        handoffs=[],
+        tracing=ModelTracing.DISABLED,
+        previous_response_id=None,
+        conversation_id=None,
+        prompt=None,
+    ):
+        output_events.append(event)
+
+    # Verify the final response preserves usage from the first chunk
+    response_event = output_events[-1]
+    assert response_event.type == "response.completed"
+    assert response_event.response.usage is not None
+    assert response_event.response.usage.input_tokens == 2
+    assert response_event.response.usage.output_tokens == 4
+    assert response_event.response.usage.total_tokens == 6
+
+
 @pytest.mark.allow_call_model_methods
 @pytest.mark.asyncio
 async def test_stream_response_with_empty_reasoning_content(monkeypatch) -> None:
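
Note (standalone sketch, not part of the patch): a minimal illustration of the failure mode this commit fixes. FakeChunk and FakeUsage are hypothetical stand-ins for the ChatCompletionChunk / CompletionUsage objects the real handler receives; the two loops below mirror the old and new assignment logic in handle_stream.

from __future__ import annotations

from dataclasses import dataclass


@dataclass
class FakeUsage:
    prompt_tokens: int
    completion_tokens: int
    total_tokens: int


@dataclass
class FakeChunk:
    usage: FakeUsage | None


chunks = [
    # Provider reports usage on an earlier chunk...
    FakeChunk(usage=FakeUsage(prompt_tokens=2, completion_tokens=4, total_tokens=6)),
    # ...but the final chunk carries no usage (as some providers, e.g. LiteLLM, do).
    FakeChunk(usage=None),
]

# Old behavior: the unconditional assignment runs on every chunk, so the
# final chunk's usage=None overwrites the real numbers from the first chunk.
usage = None
for chunk in chunks:
    usage = chunk.usage if hasattr(chunk, "usage") else None
assert usage is None  # usage data was lost

# Patched behavior: only update when the chunk actually carries usage,
# so the value from the earlier chunk survives to the end of the stream.
usage = None
for chunk in chunks:
    if hasattr(chunk, "usage") and chunk.usage is not None:
        usage = chunk.usage
assert usage is not None and usage.total_tokens == 6  # usage preserved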