"""
Tool-Using Agent-as-Judge Evaluation
====================================

Demonstrates using an agent-as-judge evaluation to score responses produced by a tool-using agent.
"""

from typing import Optional

from agno.agent import Agent
from agno.eval.agent_as_judge import AgentAsJudgeEval, AgentAsJudgeResult
from agno.models.openai import OpenAIChat
from agno.tools.calculator import CalculatorTools

# ---------------------------------------------------------------------------
# Create Agent
# ---------------------------------------------------------------------------
agent = Agent(
    model=OpenAIChat(id="gpt-4o"),
    tools=[CalculatorTools()],
    instructions="Use the calculator tools to solve math problems. Explain your reasoning and show calculation steps clearly.",
)

# ---------------------------------------------------------------------------
# Create Evaluation
# ---------------------------------------------------------------------------
evaluation = AgentAsJudgeEval(
    name="Calculator Tool Usage Quality",
    model=OpenAIChat(id="gpt-5.2"),
    criteria="Response should clearly explain the calculation process, show intermediate steps, and present the final answer in a user-friendly way",
    scoring_strategy="numeric",  # the judge returns a numeric score instead of a pass/fail verdict
    threshold=7,  # minimum judge score for the evaluation to pass
)

# ---------------------------------------------------------------------------
# Run Evaluation
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Generate a response with the tool-using agent, then hand it to the judge
    response = agent.run("What is 15 * 23 + 47?")
    result: Optional[AgentAsJudgeResult] = evaluation.run(
        input="What is 15 * 23 + 47?",
        output=str(response.content),
        print_results=True,
    )
    assert result is not None, "Evaluation should return a result"
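
The agent should use the calculator tools to compute 15 * 23 + 47 = 345 + 47 = 392, and the judge then scores how clearly the response explains that calculation. With scoring_strategy="numeric" and threshold=7, the evaluation passes when the judge's score meets or exceeds 7 (see the AgentAsJudge reference for the exact scoring scale).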

Run the Example

# Clone and setup repo
git clone https://github.com/agno-agi/agno.git
cd agno/cookbook/09_evals/agent_as_judge

# Create and activate virtual environment
./scripts/demo_setup.sh
source .venvs/demo/bin/activate
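
# Export your OpenAI API key (both the agent and the judge call OpenAI)
export OPENAI_API_KEY="sk-..."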

python agent_as_judge_with_tools.py