From 16d503c4d6ef41b9e50471da46a417dc9731f4f9 Mon Sep 17 00:00:00 2001
From: Wen-Tien Chang
Date: Wed, 26 Nov 2025 18:51:15 +0800
Subject: [PATCH] Fix: preserve usage from earlier stream chunks when later chunks have none

---
 src/agents/models/chatcmpl_stream_handler.py |  4 +-
 tests/test_reasoning_content.py              | 54 ++++++++++++++++++++
 2 files changed, 57 insertions(+), 1 deletion(-)

diff --git a/src/agents/models/chatcmpl_stream_handler.py b/src/agents/models/chatcmpl_stream_handler.py
index 94dd10205..f1c504977 100644
--- a/src/agents/models/chatcmpl_stream_handler.py
+++ b/src/agents/models/chatcmpl_stream_handler.py
@@ -97,7 +97,9 @@ async def handle_stream(
             )
 
             # This is always set by the OpenAI API, but not by others e.g. LiteLLM
-            usage = chunk.usage if hasattr(chunk, "usage") else None
+            # Only update when the chunk has usage data (the last chunk may have none)
+            if hasattr(chunk, "usage") and chunk.usage is not None:
+                usage = chunk.usage
 
             if not chunk.choices or not chunk.choices[0].delta:
                 continue
diff --git a/tests/test_reasoning_content.py b/tests/test_reasoning_content.py
index a64fdaf15..b9d7fa91f 100644
--- a/tests/test_reasoning_content.py
+++ b/tests/test_reasoning_content.py
@@ -234,6 +234,60 @@ async def patched_fetch_response(self, *args, **kwargs):
     assert resp.output[1].content[0].text == "The answer is 42"
 
 
+@pytest.mark.allow_call_model_methods
+@pytest.mark.asyncio
+async def test_stream_response_preserves_usage_from_earlier_chunk(monkeypatch) -> None:
+    """
+    Test that when an earlier chunk has usage data and later chunks don't,
+    the usage from the earlier chunk is preserved in the final response.
+    This handles cases where some providers (e.g., LiteLLM) may not include
+    usage in every chunk.
+    """
+    # Create test chunks where the first chunk has usage and the last one doesn't
+    chunks = [
+        create_chunk(create_content_delta("Hello"), include_usage=True),  # Has usage
+        create_chunk(create_content_delta("")),  # No usage (usage=None)
+    ]
+
+    async def patched_fetch_response(self, *args, **kwargs):
+        resp = Response(
+            id="resp-id",
+            created_at=0,
+            model="fake-model",
+            object="response",
+            output=[],
+            tool_choice="none",
+            tools=[],
+            parallel_tool_calls=False,
+        )
+        return resp, create_fake_stream(chunks)
+
+    monkeypatch.setattr(OpenAIChatCompletionsModel, "_fetch_response", patched_fetch_response)
+    model = OpenAIProvider(use_responses=False).get_model("gpt-4")
+    output_events = []
+    async for event in model.stream_response(
+        system_instructions=None,
+        input="",
+        model_settings=ModelSettings(),
+        tools=[],
+        output_schema=None,
+        handoffs=[],
+        tracing=ModelTracing.DISABLED,
+        previous_response_id=None,
+        conversation_id=None,
+        prompt=None,
+    ):
+        output_events.append(event)
+
+    # Verify the final response preserves usage from the first chunk
+    response_event = output_events[-1]
+    assert response_event.type == "response.completed"
+    assert response_event.response.usage is not None
+    assert response_event.response.usage.input_tokens == 2
+    assert response_event.response.usage.output_tokens == 4
+    assert response_event.response.usage.total_tokens == 6
+
+
 @pytest.mark.allow_call_model_methods
 @pytest.mark.asyncio
 async def test_stream_response_with_empty_reasoning_content(monkeypatch) -> None:
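
Note (standalone sketch, not part of the patch): a minimal illustration of the failure mode this commit fixes. FakeChunk and FakeUsage are hypothetical stand-ins for the ChatCompletionChunk / CompletionUsage objects the real handler receives; the two loops below mirror the old and new assignment logic in handle_stream.

from __future__ import annotations

from dataclasses import dataclass


@dataclass
class FakeUsage:
    prompt_tokens: int
    completion_tokens: int
    total_tokens: int


@dataclass
class FakeChunk:
    usage: FakeUsage | None


chunks = [
    # Provider reports usage on an earlier chunk...
    FakeChunk(usage=FakeUsage(prompt_tokens=2, completion_tokens=4, total_tokens=6)),
    # ...but the final chunk carries no usage (as some providers, e.g. LiteLLM, do).
    FakeChunk(usage=None),
]

# Old behavior: the unconditional assignment runs on every chunk, so the
# final chunk's usage=None overwrites the real numbers from the first chunk.
usage = None
for chunk in chunks:
    usage = chunk.usage if hasattr(chunk, "usage") else None
assert usage is None  # usage data was lost

# Patched behavior: only update when the chunk actually carries usage,
# so the value from the earlier chunk survives to the end of the stream.
usage = None
for chunk in chunks:
    if hasattr(chunk, "usage") and chunk.usage is not None:
        usage = chunk.usage
assert usage is not None and usage.total_tokens == 6  # usage preserved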