Fix guardrail task cleanup to properly await cancelled tasks

gn00295120 · gn00295120 · commit eb447fbdd3ab · 2025-10-23T00:50:45.000+08:00
Problem:
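The _cleanup_guardrail_tasks() method in RealtimeSession only called
task.cancel() on pending guardrail tasks and never awaited them. Because
cancel() merely requests cancellation, the tasks could still be unfinished
when cleanup returned. This could lead to:
1. asyncio warnings such as "Task was destroyed but it is pending!" or
 "Task exception was never retrieved"
2. Potential memory leaks from abandoned tasks
3. Improper resource cleanup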

Evidence:
- Test code in tests/realtime/test_session.py:1199 shows the correct pattern:
 await asyncio.gather(*session._guardrail_tasks, return_exceptions=True)
- Similar pattern used in openai_realtime.py:519-523 for WebSocket task cleanup

Solution:
1. Made _cleanup_guardrail_tasks() async
2. Added await asyncio.gather() for real asyncio.Task objects to properly
 collect exceptions (with isinstance check to support mock objects in tests)
3. Updated _cleanup() to await the cleanup method

Testing:
- Created comprehensive test suite in tests/realtime/test_guardrail_cleanup.py
 with 3 test cases:
 1. Verify cancelled tasks are properly awaited
 2. Verify exceptions during cleanup are handled
 3. Verify multiple concurrent tasks are cleaned up
- All new tests pass
- All existing tests pass (838 passed, 3 skipped), apart from the
 pre-existing failure noted below
- Note: test_issue_889_guardrail_tool_execution has 1 pre-existing failure
 unrelated to this PR (it also fails on main)
diff --git a/src/agents/realtime/session.py b/src/agents/realtime/session.py
@@ -746,16 +746,32 @@ def _on_guardrail_task_done(self, task: asyncio.Task[Any]) -> None:
 )
 )
 
-    def _cleanup_guardrail_tasks(self) -> None:
+    async def _cleanup_guardrail_tasks(self) -> None:
+        """Cancel all pending guardrail tasks and wait for them to finish.
+
+        Awaiting the cancelled tasks gives each one the chance to process its
+        ``CancelledError`` (or any exception raised while unwinding) before
+        this method returns. ``return_exceptions=True`` makes ``gather`` hand
+        those exceptions back as results instead of raising them here.
+
+        The task collection is cleared in a ``finally`` block so it is emptied
+        even if this coroutine is itself cancelled while waiting.
+        """
+        # Tasks that were just cancelled and therefore still need awaiting.
+        cancelled_tasks = []
+
         for task in self._guardrail_tasks:
             if not task.done():
                 task.cancel()
+                # Only await real asyncio.Task objects (not mocks in tests)
+                if isinstance(task, asyncio.Task):
+                    cancelled_tasks.append(task)
+
+        try:
+            # Wait for the cancelled tasks to finish, collecting any exceptions
+            if cancelled_tasks:
+                await asyncio.gather(*cancelled_tasks, return_exceptions=True)
+        finally:
+            # Drop the references even when the wait above is interrupted.
-        self._guardrail_tasks.clear()
+            self._guardrail_tasks.clear()
 
 async def _cleanup(self) -> None:
 """Clean up all resources and mark session as closed."""
 # Cancel and cleanup guardrail tasks
- self._cleanup_guardrail_tasks()
+ await self._cleanup_guardrail_tasks()
 
 # Remove ourselves as a listener
 self._model.remove_listener(self)
diff --git a/tests/realtime/test_guardrail_cleanup.py b/tests/realtime/test_guardrail_cleanup.py
@@ -0,0 +1,246 @@
+"""Test guardrail task cleanup to ensure proper exception handling.
+
+This test verifies the fix for the bug where _cleanup_guardrail_tasks() was not
+properly awaiting cancelled tasks, which could lead to unhandled task exceptions
+and potential memory leaks.
+"""
+
+import asyncio
+from unittest.mock import AsyncMock, Mock, PropertyMock
+
+import pytest
+
+from agents.guardrail import GuardrailFunctionOutput, OutputGuardrail
+from agents.realtime import RealtimeSession
+from agents.realtime.agent import RealtimeAgent
+from agents.realtime.config import RealtimeRunConfig
+from agents.realtime.model import RealtimeModel
+from agents.realtime.model_events import RealtimeModelTranscriptDeltaEvent
+
+
+class MockRealtimeModel(RealtimeModel):
+    """In-memory RealtimeModel stand-in that records every interaction."""
+
+    def __init__(self):
+        super().__init__()
+        # Lifecycle flags flipped by connect() / close().
+        self.connect_called = False
+        self.close_called = False
+        # Registered listeners and the raw events handed to send_event().
+        self.listeners = []
+        self.sent_events = []
+        # Legacy per-type views of the sent events.
+        self.sent_messages = []
+        self.sent_audio = []
+        self.sent_tool_outputs = []
+        self.interrupts_called = 0
+
+    async def connect(self, options=None):
+        """Record that connect() was called; ``options`` is ignored."""
+        self.connect_called = True
+
+    def add_listener(self, listener):
+        """Append ``listener`` to the recorded listeners."""
+        self.listeners.append(listener)
+
+    def remove_listener(self, listener):
+        """Remove ``listener`` if it is registered; otherwise do nothing."""
+        if listener not in self.listeners:
+            return
+        self.listeners.remove(listener)
+
+    async def send_event(self, event):
+        """Record ``event`` and mirror it into the matching legacy tracker."""
+        from agents.realtime.model_inputs import (
+            RealtimeModelSendAudio,
+            RealtimeModelSendInterrupt,
+            RealtimeModelSendToolOutput,
+            RealtimeModelSendUserInput,
+        )
+
+        self.sent_events.append(event)
+
+        # Update legacy tracking for compatibility; at most one branch applies.
+        if isinstance(event, RealtimeModelSendUserInput):
+            self.sent_messages.append(event.user_input)
+            return
+        if isinstance(event, RealtimeModelSendAudio):
+            self.sent_audio.append((event.audio, event.commit))
+            return
+        if isinstance(event, RealtimeModelSendToolOutput):
+            self.sent_tool_outputs.append((event.tool_call, event.output, event.start_response))
+            return
+        if isinstance(event, RealtimeModelSendInterrupt):
+            self.interrupts_called += 1
+
+    async def close(self):
+        """Record that close() was called."""
+        self.close_called = True
+
+
+@pytest.fixture
+def mock_model():
+    """Provide a newly constructed MockRealtimeModel."""
+    model = MockRealtimeModel()
+    return model
+
+
+@pytest.fixture
+def mock_agent():
+    """Provide a RealtimeAgent-spec Mock with no tools, handoffs or guardrails."""
+    fake_agent = Mock(spec=RealtimeAgent)
+
+    # Property mocks have to be installed on the mock's type, not the instance.
+    agent_type = type(fake_agent)
+    agent_type.handoffs = PropertyMock(return_value=[])
+    agent_type.output_guardrails = PropertyMock(return_value=[])
+
+    fake_agent.get_all_tools = AsyncMock(return_value=[])
+    fake_agent.name = "test_agent"
+    return fake_agent
+
+
+@pytest.mark.asyncio
+async def test_guardrail_task_cleanup_awaits_cancelled_tasks(mock_model, mock_agent):
+    """Cleanup must cancel a running guardrail task and wait for it to unwind.
+
+    A guardrail blocked in a long sleep is started, then
+    ``_cleanup_guardrail_tasks()`` is awaited. By the time cleanup returns the
+    guardrail must have observed ``CancelledError`` and the session's task
+    collection must be empty.
+    """
+    started = asyncio.Event()
+    cancelled = asyncio.Event()
+
+    async def slow_guardrail_func(context, agent, output):
+        """Block in a long sleep and record when cancellation arrives."""
+        started.set()
+        try:
+            # Stand-in for a long-running operation.
+            await asyncio.sleep(10)
+        except asyncio.CancelledError:
+            cancelled.set()
+            raise
+        return GuardrailFunctionOutput(output_info={}, tripwire_triggered=False)
+
+    run_config: RealtimeRunConfig = {
+        "output_guardrails": [
+            OutputGuardrail(guardrail_function=slow_guardrail_func, name="slow_guardrail")
+        ],
+        "guardrails_settings": {"debounce_text_length": 5},
+    }
+    session = RealtimeSession(mock_model, mock_agent, None, run_config=run_config)
+
+    # Trigger a guardrail by sending a transcript delta, then wait for it to start.
+    await session.on_event(
+        RealtimeModelTranscriptDeltaEvent(
+            item_id="item_1", delta="hello world", response_id="resp_1"
+        )
+    )
+    await asyncio.wait_for(started.wait(), timeout=1.0)
+
+    # Exactly one guardrail task should be in flight.
+    assert len(session._guardrail_tasks) == 1
+    task = next(iter(session._guardrail_tasks))
+    assert not task.done()
+
+    # Cleanup should cancel the task and await it.
+    await session._cleanup_guardrail_tasks()
+
+    assert cancelled.is_set(), "Task should have received CancelledError"
+    assert len(session._guardrail_tasks) == 0, "Tasks list should be cleared"
+
+    # No warnings should be raised about unhandled task exceptions
+
+
+@pytest.mark.asyncio
+async def test_guardrail_task_cleanup_with_exception(mock_model, mock_agent):
+    """Cleanup must complete even when a cancelled guardrail raises.
+
+    The guardrail converts the ``CancelledError`` it receives into a
+    ``RuntimeError``. Because cleanup gathers with ``return_exceptions=True``,
+    that error must not propagate out of ``_cleanup_guardrail_tasks()``.
+    """
+    started = asyncio.Event()
+    exception_raised = asyncio.Event()
+
+    async def failing_guardrail_func(context, agent, output):
+        """Sleep until cancelled, then fail with a RuntimeError."""
+        started.set()
+        try:
+            await asyncio.sleep(10)
+        except asyncio.CancelledError as cancel_exc:
+            exception_raised.set()
+            # Simulate an error during cleanup
+            raise RuntimeError("Cleanup error") from cancel_exc
+        return GuardrailFunctionOutput(output_info={}, tripwire_triggered=False)
+
+    run_config: RealtimeRunConfig = {
+        "output_guardrails": [
+            OutputGuardrail(
+                guardrail_function=failing_guardrail_func, name="failing_guardrail"
+            )
+        ],
+        "guardrails_settings": {"debounce_text_length": 5},
+    }
+    session = RealtimeSession(mock_model, mock_agent, None, run_config=run_config)
+
+    # Trigger a guardrail and wait for it to start.
+    await session.on_event(
+        RealtimeModelTranscriptDeltaEvent(
+            item_id="item_1", delta="hello world", response_id="resp_1"
+        )
+    )
+    await asyncio.wait_for(started.wait(), timeout=1.0)
+
+    # Cleanup should not raise the RuntimeError due to return_exceptions=True
+    await session._cleanup_guardrail_tasks()
+
+    # The guardrail saw the cancellation and the task collection is empty.
+    assert exception_raised.is_set()
+    assert len(session._guardrail_tasks) == 0
+
+
+@pytest.mark.asyncio
+async def test_guardrail_task_cleanup_with_multiple_tasks(mock_model, mock_agent):
+    """Cleanup must cancel and await every pending guardrail task.
+
+    Three transcript deltas are sent to trigger guardrails multiple times,
+    then cleanup is awaited. At least one guardrail must have been cancelled
+    and the session's task collection must be empty afterwards.
+    """
+    tasks_started = asyncio.Event()
+    tasks_cancelled = 0
+
+    async def slow_guardrail_func(context, agent, output):
+        """Sleep until cancelled and count each cancellation."""
+        nonlocal tasks_cancelled
+        tasks_started.set()
+        try:
+            await asyncio.sleep(10)
+        except asyncio.CancelledError:
+            tasks_cancelled += 1
+            raise
+        return GuardrailFunctionOutput(output_info={}, tripwire_triggered=False)
+
+    run_config: RealtimeRunConfig = {
+        "output_guardrails": [
+            OutputGuardrail(guardrail_function=slow_guardrail_func, name="slow_guardrail")
+        ],
+        "guardrails_settings": {"debounce_text_length": 5},
+    }
+    session = RealtimeSession(mock_model, mock_agent, None, run_config=run_config)
+
+    # Trigger guardrails multiple times to create multiple tasks
+    for i in range(3):
+        await session.on_event(
+            RealtimeModelTranscriptDeltaEvent(
+                item_id=f"item_{i}", delta="hello world", response_id=f"resp_{i}"
+            )
+        )
+
+    # Wait for at least one task to start
+    await asyncio.wait_for(tasks_started.wait(), timeout=1.0)
+
+    # Should have at least one guardrail task
+    assert len(session._guardrail_tasks) >= 1, "At least one guardrail task should exist"
+
+    # Cleanup should cancel and await all tasks
+    await session._cleanup_guardrail_tasks()
+
+    # Verify tasks were cancelled and the collection was cleared
+    assert tasks_cancelled >= 1, "At least one task should have been cancelled"
+    assert len(session._guardrail_tasks) == 0

-Original file line number
+Diff line change
@@ @@ -0,0 +1,246 @@ @@
 +"""Test guardrail task cleanup to ensure proper exception handling.
++
 +This test verifies the fix for the bug where _cleanup_guardrail_tasks() was not
 +properly awaiting cancelled tasks, which could lead to unhandled task exceptions
 +and potential memory leaks.
 +"""
++
 +importasyncio
 +fromunittest.mockimportAsyncMock, Mock, PropertyMock
++
 +importpytest
++
 +fromagents.guardrailimportGuardrailFunctionOutput, OutputGuardrail
 +fromagents.realtimeimportRealtimeSession
 +fromagents.realtime.agentimportRealtimeAgent
 +fromagents.realtime.configimportRealtimeRunConfig
 +fromagents.realtime.modelimportRealtimeModel
 +fromagents.realtime.model_eventsimportRealtimeModelTranscriptDeltaEvent
++
++
 +classMockRealtimeModel(RealtimeModel):
 +"""Mock realtime model for testing."""
++
 +def__init__(self):
 +super().__init__()
 +self.listeners= []
 +self.connect_called=False
 +self.close_called=False
 +self.sent_events= []
 +self.sent_messages= []
 +self.sent_audio= []
 +self.sent_tool_outputs= []
 +self.interrupts_called=0
++
 +asyncdefconnect(self, options=None):
 +self.connect_called=True
++
 +defadd_listener(self, listener):
 +self.listeners.append(listener)
++
 +defremove_listener(self, listener):
 +iflistenerinself.listeners:
 +self.listeners.remove(listener)
++
 +asyncdefsend_event(self, event):
 +fromagents.realtime.model_inputsimport (
 +RealtimeModelSendAudio,
 +RealtimeModelSendInterrupt,
 +RealtimeModelSendToolOutput,
 +RealtimeModelSendUserInput,
 + )
++
 +self.sent_events.append(event)
++
 +# Update legacy tracking for compatibility
 +ifisinstance(event, RealtimeModelSendUserInput):
 +self.sent_messages.append(event.user_input)
 +elifisinstance(event, RealtimeModelSendAudio):
 +self.sent_audio.append((event.audio, event.commit))
 +elifisinstance(event, RealtimeModelSendToolOutput):
 +self.sent_tool_outputs.append((event.tool_call, event.output, event.start_response))
 +elifisinstance(event, RealtimeModelSendInterrupt):
 +self.interrupts_called+=1
++
 +asyncdefclose(self):
 +self.close_called=True
++
++
 +@pytest.fixture
 +defmock_model():
 +returnMockRealtimeModel()
++
++
 +@pytest.fixture
 +defmock_agent():
 +agent=Mock(spec=RealtimeAgent)
 +agent.name="test_agent"
 +agent.get_all_tools=AsyncMock(return_value=[])
 + type(agent).handoffs=PropertyMock(return_value=[])
 + type(agent).output_guardrails=PropertyMock(return_value=[])
 +returnagent
++
++
 +@pytest.mark.asyncio
 +asyncdeftest_guardrail_task_cleanup_awaits_cancelled_tasks(mock_model, mock_agent):
 +"""Test that cleanup properly awaits cancelled guardrail tasks.
++
 + This test verifies that when guardrail tasks are cancelled during cleanup,
 + the cleanup method properly awaits them to completion using asyncio.gather()
 + with return_exceptions=True. This ensures:
 + 1. No warnings about unhandled task exceptions
 + 2. Proper resource cleanup
 + 3. No memory leaks from abandoned tasks
 + """
++
 +# Create a guardrail that runs a long async operation
 +task_started=asyncio.Event()
 +task_cancelled=asyncio.Event()
++
 +asyncdefslow_guardrail_func(context, agent, output):
 +"""A guardrail that takes time to execute."""
 +task_started.set()
 +try:
 +# Simulate a long-running operation
 +awaitasyncio.sleep(10)
 +returnGuardrailFunctionOutput(output_info={}, tripwire_triggered=False)
 +exceptasyncio.CancelledError:
 +task_cancelled.set()
 +raise
++
 +guardrail=OutputGuardrail(guardrail_function=slow_guardrail_func, name="slow_guardrail")
++
 +run_config: RealtimeRunConfig={
 +"output_guardrails": [guardrail],
 +"guardrails_settings":{"debounce_text_length": 5},
 + }
++
 +session=RealtimeSession(mock_model, mock_agent, None, run_config=run_config)
++
 +# Trigger a guardrail by sending a transcript delta
 +transcript_event=RealtimeModelTranscriptDeltaEvent(
 +item_id="item_1", delta="hello world", response_id="resp_1"
 + )
++
 +awaitsession.on_event(transcript_event)
++
 +# Wait for the guardrail task to start
 +awaitasyncio.wait_for(task_started.wait(), timeout=1.0)
++
 +# Verify a guardrail task was created
 +assertlen(session._guardrail_tasks) ==1
 +task=list(session._guardrail_tasks)[0]
 +assertnottask.done()
++
 +# Now cleanup the session - this should cancel and await the task
 +awaitsession._cleanup_guardrail_tasks()
++
 +# Verify the task was cancelled and properly awaited
 +asserttask_cancelled.is_set(), "Task should have received CancelledError"
 +assertlen(session._guardrail_tasks) ==0, "Tasks list should be cleared"
++
 +# No warnings should be raised about unhandled task exceptions
++
++
 +@pytest.mark.asyncio
 +asyncdeftest_guardrail_task_cleanup_with_exception(mock_model, mock_agent):
 +"""Test that cleanup handles guardrail tasks that raise exceptions.
++
 + This test verifies that if a guardrail task raises an exception (not just
 + CancelledError), the cleanup method still completes successfully and doesn't
 + propagate the exception, thanks to return_exceptions=True.
 + """
++
 +task_started=asyncio.Event()
 +exception_raised=asyncio.Event()
++
 +asyncdeffailing_guardrail_func(context, agent, output):
 +"""A guardrail that raises an exception."""
 +task_started.set()
 +try:
 +awaitasyncio.sleep(10)
 +returnGuardrailFunctionOutput(output_info={}, tripwire_triggered=False)
 +exceptasyncio.CancelledErrorase:
 +exception_raised.set()
 +# Simulate an error during cleanup
 +raiseRuntimeError("Cleanup error") frome
++
 +guardrail=OutputGuardrail(
 +guardrail_function=failing_guardrail_func, name="failing_guardrail"
 + )
++
 +run_config: RealtimeRunConfig={
 +"output_guardrails": [guardrail],
 +"guardrails_settings":{"debounce_text_length": 5},
 + }
++
 +session=RealtimeSession(mock_model, mock_agent, None, run_config=run_config)
++
 +# Trigger a guardrail
 +transcript_event=RealtimeModelTranscriptDeltaEvent(
 +item_id="item_1", delta="hello world", response_id="resp_1"
 + )
++
 +awaitsession.on_event(transcript_event)
++
 +# Wait for the guardrail task to start
 +awaitasyncio.wait_for(task_started.wait(), timeout=1.0)
++
 +# Cleanup should not raise the RuntimeError due to return_exceptions=True
 +awaitsession._cleanup_guardrail_tasks()
++
 +# Verify cleanup completed successfully
 +assertexception_raised.is_set()
 +assertlen(session._guardrail_tasks) ==0
++
++
 +@pytest.mark.asyncio
 +asyncdeftest_guardrail_task_cleanup_with_multiple_tasks(mock_model, mock_agent):
 +"""Test cleanup with multiple pending guardrail tasks.
++
 + This test verifies that cleanup properly handles multiple concurrent guardrail
 + tasks by triggering guardrails multiple times, then cancelling and awaiting all of them.
 + """
++
 +tasks_started=asyncio.Event()
 +tasks_cancelled=0
++
 +asyncdefslow_guardrail_func(context, agent, output):
 +nonlocaltasks_cancelled
 +tasks_started.set()
 +try:
 +awaitasyncio.sleep(10)
 +returnGuardrailFunctionOutput(output_info={}, tripwire_triggered=False)
 +exceptasyncio.CancelledError:
 +tasks_cancelled+=1
 +raise
++
 +guardrail=OutputGuardrail(guardrail_function=slow_guardrail_func, name="slow_guardrail")
++
 +run_config: RealtimeRunConfig={
 +"output_guardrails": [guardrail],
 +"guardrails_settings":{"debounce_text_length": 5},
 + }
++
 +session=RealtimeSession(mock_model, mock_agent, None, run_config=run_config)
++
 +# Trigger guardrails multiple times to create multiple tasks
 +foriinrange(3):
 +transcript_event=RealtimeModelTranscriptDeltaEvent(
 +item_id=f"item_{i}", delta="hello world", response_id=f"resp_{i}"
 + )
 +awaitsession.on_event(transcript_event)
++
 +# Wait for at least one task to start
 +awaitasyncio.wait_for(tasks_started.wait(), timeout=1.0)
++
 +# Should have at least one guardrail task
 +initial_task_count=len(session._guardrail_tasks)
 +assertinitial_task_count>=1, "At least one guardrail task should exist"
++
 +# Cleanup should cancel and await all tasks
 +awaitsession._cleanup_guardrail_tasks()
++
 +# Verify all tasks were cancelled and cleared
 +asserttasks_cancelled>=1, "At least one task should have been cancelled"
 +assertlen(session._guardrail_tasks) ==0