Skip to main content
"""Eval model metrics accumulated into run_output.metrics.details["eval_model"]."""

from agno.agent import Agent
from agno.eval.accuracy import AccuracyEval
from agno.models.openai import OpenAIChat
from rich.pretty import pprint

# The agent and the evaluation are configured with the same OpenAI model id,
# but each receives its own model instance.
_MODEL_ID = "gpt-4o-mini"

# Agent whose answer gets scored.
agent = Agent(
    instructions="Answer factual questions concisely.",
    model=OpenAIChat(id=_MODEL_ID),
)

# Accuracy evaluation for a single question with a known expected answer,
# run for one iteration.
evaluation = AccuracyEval(
    agent=agent,
    model=OpenAIChat(id=_MODEL_ID),
    name="Capital Cities",
    input="What is the capital of Japan?",
    expected_output="Tokyo",
    num_iterations=1,
)

def sum_total_tokens(model_metrics) -> int:
    """Return the sum of ``total_tokens`` over the given metric entries.

    Args:
        model_metrics: Iterable of objects exposing a numeric ``total_tokens``
            attribute (here, the entries stored under one key of
            ``run_output.metrics.details``).

    Returns:
        The summed token count; ``0`` when the iterable is empty.
    """
    return sum(entry.total_tokens for entry in model_metrics)


def main() -> None:
    """Run the agent, score its answer, and print the combined metrics.

    The agent is prompted with the evaluation's own input so the question that
    is answered and the question that is judged cannot drift apart.  The run's
    metrics object is handed to ``evaluate_answer`` via ``run_metrics`` so that
    evaluator usage accumulates into the same object as the agent usage.
    """
    # Single source of truth for the question and the expected answer.
    eval_input = evaluation.get_eval_input()
    eval_expected = evaluation.get_eval_expected_output()

    # First, run the agent to get a response.
    run_output = agent.run(eval_input)
    agent_output = str(run_output.content)

    # Run the evaluator, passing run_output.metrics so eval metrics accumulate
    # into it.
    evaluator_agent = evaluation.get_evaluator_agent()
    evaluation_input = (
        f"<agent_input>\n{eval_input}\n</agent_input>\n\n"
        f"<expected_output>\n{eval_expected}\n</expected_output>\n\n"
        f"<agent_output>\n{agent_output}\n</agent_output>"
    )
    result = evaluation.evaluate_answer(
        input=eval_input,
        evaluator_agent=evaluator_agent,
        evaluation_input=evaluation_input,
        evaluator_expected_output=eval_expected,
        agent_output=agent_output,
        run_metrics=run_output.metrics,
    )

    if result:
        print(f"Score: {result.score}/10")
        # Keep the console output short; only the start of the reason is shown.
        print(f"Reason: {result.reason[:200]}")
    else:
        # Make a missing evaluation visible instead of silently skipping it.
        print("Evaluation returned no result.")

    # The run_output now has both agent + eval metrics.
    metrics = run_output.metrics
    if not metrics:
        return

    print("\nTotal tokens (agent + eval):", metrics.total_tokens)

    details = metrics.details
    if not details:
        return

    # Per-bucket totals, printed only for the buckets that are present.
    for label, key in (
        ("Agent model tokens:", "model"),
        ("Eval model tokens:", "eval_model"),
    ):
        if key in details:
            print(label, sum_total_tokens(details[key]))

    print("\nFull metrics breakdown:")
    pprint(metrics.to_dict())


if __name__ == "__main__":
    main()

Run the Example

# Clone and setup repo
git clone https://github.com/agno-agi/agno.git
cd agno/cookbook/09_evals/accuracy

# Create and activate virtual environment
# NOTE(review): both paths below are relative to the directory entered by the
# `cd` above — confirm that scripts/demo_setup.sh and .venvs/demo exist there;
# if they live at the repository root, run these two commands from the root.
./scripts/demo_setup.sh
source .venvs/demo/bin/activate

# Run the example script from the current directory
python accuracy_eval_metrics.py