fix(evaluation): forward App through to the eval Runner

saifer82 · saifer82 · commit f02830579bfb · 2026-04-28T22:23:52.000+01:00
`_generate_inferences_from_root_agent` now accepts an optional `app`
parameter. When provided, the eval Runner is built from a copy of the
App in which the internal eval plugins (`_RequestIntercepterPlugin`,
`EnsureRetryOptionsPlugin`) are appended to the App's own plugins and
`root_agent` is set to the agent being evaluated. The user's App is
never mutated, and its `context_cache_config` / `resumability_config`
are carried over on the copy. When `app` is None, the legacy
bare-agent path is preserved.

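`_process_query` (used by the public `generate_responses` entry point)
now checks `<module>.agent` for an `app` attribute first. If it is an
`App` instance, its `root_agent` is evaluated and the App is forwarded
to the helper; otherwise the module's `root_agent` is used as before.
Projects that wrap their root agent in an `App` therefore get plugin
coverage during eval without further changes.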

The CLI plumbing that hands the App down from `cli_eval` /
`LocalEvalService` is in the next commit.
diff --git a/src/google/adk/evaluation/evaluation_generator.py b/src/google/adk/evaluation/evaluation_generator.py
@@ -26,6 +26,7 @@
 from pydantic import BaseModel
 
 from ..agents.llm_agent import Agent
+from ..apps.app import App
 from ..artifacts.base_artifact_service import BaseArtifactService
 from ..artifacts.in_memory_artifact_service import InMemoryArtifactService
 from ..events.event import Event
@@ -143,7 +144,15 @@ async def _process_query(
     """Process a query using the agent and evaluation dataset."""
     module_path = f"{module_name}"
     agent_module = importlib.import_module(module_path)
-    root_agent = agent_module.agent.root_agent
+    # Prefer the wrapping `App` when the module exposes one, so that
+    # `app.plugins`, context-cache, and resumability configs participate
+    # in eval runs the same way they do for `adk web` / `adk run`.
+    app_obj = getattr(agent_module.agent, "app", None)
+    if isinstance(app_obj, App):
+      root_agent = app_obj.root_agent
+    else:
+      app_obj = None
+      root_agent = agent_module.agent.root_agent
 
     reset_func = getattr(agent_module.agent, "reset_data", None)
 
@@ -157,6 +166,7 @@ async def _process_query(
         user_simulator=user_simulator,
         reset_func=reset_func,
         initial_session=initial_session,
+        app=app_obj,
     )
 
   @staticmethod
@@ -197,8 +207,17 @@ async def _generate_inferences_from_root_agent(
       session_service: Optional[BaseSessionService] = None,
       artifact_service: Optional[BaseArtifactService] = None,
       memory_service: Optional[BaseMemoryService] = None,
+      app: Optional[App] = None,
   ) -> list[Invocation]:
-    """Scrapes the root agent in coordination with the user simulator."""
+    """Scrapes the root agent in coordination with the user simulator.
+
+    If `app` is provided, the eval Runner is built from a copy of the App
+    whose `plugins` are the App's plugins followed by the internal eval
+    plugins and whose `root_agent` is replaced by the `root_agent` argument;
+    all other App fields (e.g. `context_cache_config`, `resumability_config`)
+    carry over unchanged. Otherwise the Runner is built from the bare
+    `root_agent` with only the internal eval plugins (legacy behavior).
+    """
 
     if not session_service:
       session_service = InMemorySessionService()
@@ -235,13 +254,39 @@ async def _generate_inferences_from_root_agent(
     ensure_retry_options_plugin = EnsureRetryOptionsPlugin(
         name="ensure_retry_options"
     )
+    internal_eval_plugins = [
+        request_intercepter_plugin,
+        ensure_retry_options_plugin,
+    ]
+
+    if app is not None:
+      # Copy the App so we don't mutate the user's instance, and merge our
+      # internal eval plugins with the user's. Override `root_agent` so the
+      # Runner targets the agent the caller actually asked us to evaluate
+      # (e.g., a sub-agent), while still carrying the App's plugins,
+      # context_cache_config, and resumability_config.
+      runner_app = app.model_copy(
+          update={
+              "plugins": list(app.plugins) + internal_eval_plugins,
+              "root_agent": root_agent,
+          }
+      )
+      runner_kwargs: dict[str, Any] = {
+          "app": runner_app,
+          "app_name": app_name,
+      }
+    else:
+      runner_kwargs = {
+          "app_name": app_name,
+          "agent": root_agent,
+          "plugins": internal_eval_plugins,
+      }
+
     async with Runner(
-        app_name=app_name,
-        agent=root_agent,
+        **runner_kwargs,
         artifact_service=artifact_service,
         session_service=session_service,
         memory_service=memory_service,
-        plugins=[request_intercepter_plugin, ensure_retry_options_plugin],
     ) as runner:
       events = []
       while True:
diff --git a/tests/unittests/evaluation/test_evaluation_generator.py b/tests/unittests/evaluation/test_evaluation_generator.py
@@ -14,10 +14,13 @@
 
 from __future__ import annotations
 
+from google.adk.agents.base_agent import BaseAgent
+from google.adk.apps.app import App
 from google.adk.evaluation.app_details import AgentDetails
 from google.adk.evaluation.app_details import AppDetails
 from google.adk.evaluation.evaluation_generator import EvaluationGenerator
 from google.adk.evaluation.request_intercepter_plugin import _RequestIntercepterPlugin
+from google.adk.plugins.base_plugin import BasePlugin
 from google.adk.evaluation.simulation.user_simulator import NextUserMessage
 from google.adk.evaluation.simulation.user_simulator import Status as UserSimulatorStatus
 from google.adk.evaluation.simulation.user_simulator import UserSimulator
@@ -479,3 +482,133 @@ async def mock_generate_inferences_side_effect(
     mock_generate_inferences.assert_called_once()
     called_with_content = mock_generate_inferences.call_args.args[3]
     assert called_with_content.parts[0].text == "message 1"
+
+
+class _SpyPlugin(BasePlugin):
+  """A user-defined plugin used to assert merge behavior."""
+
+  pass
+
+
+class TestGenerateInferencesFromRootAgentWithApp:
+  """Tests that App.plugins / configs are honored when an App is provided."""
+
+  @pytest.fixture
+  def runner_cls(self, mocker):
+    """Patches Runner and returns the patched class for kwargs inspection."""
+    mock_runner_cls = mocker.patch(
+        "google.adk.evaluation.evaluation_generator.Runner"
+    )
+    mock_runner_instance = mocker.AsyncMock()
+    mock_runner_instance.__aenter__.return_value = mock_runner_instance
+    mock_runner_cls.return_value = mock_runner_instance
+    yield mock_runner_cls
+
+  @pytest.fixture
+  def stop_immediately_simulator(self, mocker):
+    """Returns a UserSimulator that stops on first call (no inference work)."""
+    sim = mocker.MagicMock(spec=UserSimulator)
+    sim.get_next_user_message = mocker.AsyncMock(
+        return_value=NextUserMessage(
+            status=UserSimulatorStatus.STOP_SIGNAL_DETECTED
+        )
+    )
+    return sim
+
+  @pytest.mark.asyncio
+  async def test_runner_built_from_app_when_provided(
+      self, runner_cls, mock_session_service, stop_immediately_simulator
+  ):
+    """When `app` is passed, Runner is built with `app=` (merged) instead of `agent=`."""
+    root_agent = BaseAgent(name="root_agent")
+    user_plugin = _SpyPlugin(name="user_plugin")
+    app = App(name="my_app", root_agent=root_agent, plugins=[user_plugin])
+
+    await EvaluationGenerator._generate_inferences_from_root_agent(
+        root_agent=root_agent,
+        user_simulator=stop_immediately_simulator,
+        app=app,
+    )
+
+    runner_cls.assert_called_once()
+    kwargs = runner_cls.call_args.kwargs
+    assert "agent" not in kwargs, (
+        "Runner must not receive `agent=` when `app=` is provided "
+        "(would raise ValueError)."
+    )
+    assert "plugins" not in kwargs, (
+        "Runner must not receive `plugins=` when `app=` is provided "
+        "(would raise ValueError)."
+    )
+    runner_app = kwargs["app"]
+    assert isinstance(runner_app, App)
+    plugin_names = [p.name for p in runner_app.plugins]
+    assert "user_plugin" in plugin_names, (
+        "User plugin must be preserved in the merged App passed to Runner."
+    )
+    assert "request_intercepter_plugin" in plugin_names
+    assert "ensure_retry_options" in plugin_names
+
+  @pytest.mark.asyncio
+  async def test_user_app_is_not_mutated(
+      self, runner_cls, mock_session_service, stop_immediately_simulator
+  ):
+    """The user's App instance must not be mutated across eval runs."""
+    root_agent = BaseAgent(name="root_agent")
+    user_plugin = _SpyPlugin(name="user_plugin")
+    app = App(name="my_app", root_agent=root_agent, plugins=[user_plugin])
+    original_plugins_id = id(app.plugins)
+
+    for _ in range(3):
+      await EvaluationGenerator._generate_inferences_from_root_agent(
+          root_agent=root_agent,
+          user_simulator=stop_immediately_simulator,
+          app=app,
+      )
+
+    # The user's App instance must still hold exactly its original plugin set,
+    # regardless of how many eval runs reused it.
+    assert app.plugins == [user_plugin]
+    assert id(app.plugins) == original_plugins_id
+
+  @pytest.mark.asyncio
+  async def test_runner_falls_back_to_bare_agent_when_no_app(
+      self, runner_cls, mock_session_service, stop_immediately_simulator
+  ):
+    """When `app` is None, Runner is built with the legacy `agent=`/`plugins=` shape."""
+    root_agent = BaseAgent(name="root_agent")
+
+    await EvaluationGenerator._generate_inferences_from_root_agent(
+        root_agent=root_agent,
+        user_simulator=stop_immediately_simulator,
+    )
+
+    runner_cls.assert_called_once()
+    kwargs = runner_cls.call_args.kwargs
+    assert "app" not in kwargs
+    assert kwargs["agent"] is root_agent
+    plugin_names = [p.name for p in kwargs["plugins"]]
+    assert plugin_names == [
+        "request_intercepter_plugin",
+        "ensure_retry_options",
+    ]
+
+  @pytest.mark.asyncio
+  async def test_root_agent_override_propagates_to_merged_app(
+      self, runner_cls, mock_session_service, stop_immediately_simulator
+  ):
+    """If a sub-agent is passed as root_agent, the merged App reflects that."""
+    full_root = BaseAgent(name="full_root")
+    sub_agent = BaseAgent(name="sub_agent")
+    app = App(name="my_app", root_agent=full_root)
+
+    await EvaluationGenerator._generate_inferences_from_root_agent(
+        root_agent=sub_agent,
+        user_simulator=stop_immediately_simulator,
+        app=app,
+    )
+
+    runner_app = runner_cls.call_args.kwargs["app"]
+    assert runner_app.root_agent is sub_agent
+    # User's App must be untouched.
+    assert app.root_agent is full_root