Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion src/agents/models/chatcmpl_stream_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,9 @@ async def handle_stream(
)

# This is always set by the OpenAI API, but not by others e.g. LiteLLM
usage = chunk.usage if hasattr(chunk, "usage") else None
# Only update when chunk has usage data (not always in the last chunk)
if hasattr(chunk, "usage") and chunk.usage is not None:
usage = chunk.usage

if not chunk.choices or not chunk.choices[0].delta:
continue
Expand Down
54 changes: 54 additions & 0 deletions tests/test_reasoning_content.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,60 @@ async def patched_fetch_response(self, *args, **kwargs):
assert resp.output[1].content[0].text == "The answer is 42"


@pytest.mark.allow_call_model_methods
@pytest.mark.asyncio
async def test_stream_response_preserves_usage_from_earlier_chunk(monkeypatch) -> None:
    """
    Verify that usage reported in an early chunk survives through to the
    final completed response even when later chunks carry no usage data.
    Some providers (e.g. LiteLLM) omit usage from the last chunk, so the
    handler must keep the last non-None usage it saw.
    """
    # First chunk carries usage; the trailing chunk has usage=None.
    stream_chunks = [
        create_chunk(create_content_delta("Hello"), include_usage=True),
        create_chunk(create_content_delta("")),
    ]

    async def fake_fetch_response(self, *args, **kwargs):
        # Minimal Response shell; the interesting data comes from the stream.
        response = Response(
            id="resp-id",
            created_at=0,
            model="fake-model",
            object="response",
            output=[],
            tool_choice="none",
            tools=[],
            parallel_tool_calls=False,
        )
        return response, create_fake_stream(stream_chunks)

    monkeypatch.setattr(OpenAIChatCompletionsModel, "_fetch_response", fake_fetch_response)
    model = OpenAIProvider(use_responses=False).get_model("gpt-4")

    events = [
        event
        async for event in model.stream_response(
            system_instructions=None,
            input="",
            model_settings=ModelSettings(),
            tools=[],
            output_schema=None,
            handoffs=[],
            tracing=ModelTracing.DISABLED,
            previous_response_id=None,
            conversation_id=None,
            prompt=None,
        )
    ]

    # The final event is the completed response; its usage must match the
    # values that arrived in the first chunk.
    completed = events[-1]
    assert completed.type == "response.completed"
    usage = completed.response.usage
    assert usage is not None
    assert usage.input_tokens == 2
    assert usage.output_tokens == 4
    assert usage.total_tokens == 6


@pytest.mark.allow_call_model_methods
@pytest.mark.asyncio
async def test_stream_response_with_empty_reasoning_content(monkeypatch) -> None:
Expand Down