# arcade-mcp/toolkits/math/evals/eval_math_tools.py

import arcade_math
from arcade_math.tools.arithmetic import add, sqrt

from arcade.core.catalog import ToolCatalog
from arcade.sdk.eval import (
    BinaryCritic,
    EvalRubric,
    EvalSuite,
    tool_eval,
)

# Scoring thresholds shared by the suite and by each of its cases.
rubric = EvalRubric(fail_threshold=0.85, warn_threshold=0.95)


# Tool catalog handed to the suite, populated from the arcade_math module.
catalog = ToolCatalog()
catalog.add_module(arcade_math)


@tool_eval()
def math_eval_suite():
    """Build the evaluation suite for the math tools.

    Two cases are registered: one expecting a call to ``add`` and one
    expecting a call to ``sqrt``. Each case lists a ``BinaryCritic`` per
    expected argument and reuses the module-level ``rubric``.

    Returns:
        The ``EvalSuite`` with both cases added.
    """
    math_suite = EvalSuite(
        name="Math Tools Evaluation",
        system_message="You are an AI assistant with access to math tools. Use them to help the user with their math-related tasks.",
        catalog=catalog,
        rubric=rubric,
    )

    # Case 1: addition. The two critics split the weight evenly between
    # the arguments.
    addition_args = {"a": 12345, "b": 987654321}
    addition_critics = [
        BinaryCritic(critic_field="a", weight=0.5),  # TODO: weight should be optional
        BinaryCritic(critic_field="b", weight=0.5),
    ]
    math_suite.add_case(
        name="Add two large numbers",
        user_message="Add 12345 and 987654321",
        expected_tool_calls=[(add, addition_args)],
        rubric=rubric,
        critics=addition_critics,
    )

    # Case 2: square root. A single argument, so its critic carries the
    # full weight.
    sqrt_args = {"a": 3224990521}
    sqrt_critics = [BinaryCritic(critic_field="a", weight=1.0)]
    math_suite.add_case(
        name="Take the square root of a large number",
        user_message="What is the square root of 3224990521?",
        expected_tool_calls=[(sqrt, sqrt_args)],
        rubric=rubric,
        critics=sqrt_critics,
    )

    return math_suite