Binary Agent-as-Judge Evaluation

"""
Binary Agent-as-Judge Evaluation
================================

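Demonstrates a binary (pass/fail) evaluation of an agent's response: a judge
checks the response against a single criterion and reports whether it passed.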
"""

from agno.agent import Agent
from agno.db.sqlite import SqliteDb
from agno.eval.agent_as_judge import AgentAsJudgeEval
from agno.models.openai import OpenAIChat

# ---------------------------------------------------------------------------
# Create Database
# ---------------------------------------------------------------------------
# File-backed SQLite handle; the same object is handed to both the agent and
# the evaluation in this script.
db = SqliteDb(
    db_file="tmp/agent_as_judge_binary.db",
)

# ---------------------------------------------------------------------------
# Create Agent
# ---------------------------------------------------------------------------
# The agent whose reply gets judged: a customer-service persona on gpt-4o,
# wired to the shared database handle.
agent = Agent(
    db=db,
    instructions="You are a customer service agent. Respond professionally.",
    model=OpenAIChat(id="gpt-4o"),
)

# ---------------------------------------------------------------------------
# Create Evaluation
# ---------------------------------------------------------------------------
# Judge configuration: one named check with a single professional-tone
# criterion, using the same database handle as the agent.
evaluation = AgentAsJudgeEval(
    db=db,
    name="Professional Tone Check",
    criteria="Response must maintain professional tone without informal language or slang",
)

# ---------------------------------------------------------------------------
# Run Evaluation
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Single source of truth for the prompt, so the judge is always shown the
    # exact input the agent answered.
    prompt = "I need help with my account"

    response = agent.run(prompt)
    result = evaluation.run(
        input=prompt,
        output=str(response.content),
        print_results=True,
        print_summary=True,
    )

    # Exit with a clear message instead of an opaque AttributeError/IndexError
    # when there is no verdict to inspect.
    # NOTE(review): assumes evaluation.run may return None or an empty
    # `results` list — confirm against AgentAsJudgeEval.run.
    if result is None or not result.results:
        raise SystemExit("Evaluation produced no results")

    # Only the first (and, for a single input/output pair, presumably only)
    # verdict is reported.
    verdict = "PASSED" if result.results[0].passed else "FAILED"
    print(f"Result: {verdict}")

Run the Example

# Clone and setup repo
# Fetches the Agno repository and enters the directory that holds this example.
git clone https://github.com/agno-agi/agno.git
cd agno/cookbook/09_evals/agent_as_judge

# Create and activate virtual environment
# NOTE(review): both paths below are resolved relative to the example directory
# entered above — confirm that scripts/demo_setup.sh and .venvs/demo exist
# there rather than at the repository root.
./scripts/demo_setup.sh
source .venvs/demo/bin/activate

# Run the example script from the current directory.
python agent_as_judge_binary.py

Examples

Primitives

Context

Models

Tools

More

Binary Agent-as-Judge Evaluation

Run the Example

Examples

Primitives

Context

Models

Tools

More

Run the Example

Run the Example