arcade-mcp/toolkits/math/evals/eval_math_tools.py
Eric Gustin 8795871d51
Check if toolkit version changed before attempting publish (#198)
# PR Description
Changes to a toolkit without changes to the toolkit's version fail the
'Publish Toolkit' workflow with `HTTP Error 400: File already exists
('arcade_zoom-0.1.7.tar.gz', with blake2_256 hash
'02183cda607f06616e7edb17e3d22bc11d1d83b074b3e44066b78ec72602fb37'). See
https://pypi.org/help/#file-name-reuse for more information.`, for
example.

This PR adds the `--skip-existing` flag to `poetry publish` to avoid
attempting to publish an existing version. It also skips the Slack
notification if the publish is skipped.


The `grep`'d string comes from
https://github.com/python-poetry/poetry/blob/main/src/poetry/publishing/uploader.py#L246-L249
2025-01-13 10:00:24 -08:00

69 lines
1.6 KiB
Python

from arcade.sdk import ToolCatalog
from arcade.sdk.eval import (
BinaryCritic,
EvalRubric,
EvalSuite,
ExpectedToolCall,
tool_eval,
)
import arcade_math
from arcade_math.tools.arithmetic import add, sqrt
# Evaluation rubric shared by the suite and by every case defined below.
# NOTE(review): presumably a score under fail_threshold fails and a score
# under warn_threshold warns -- confirm against the arcade.sdk.eval docs.
rubric = EvalRubric(fail_threshold=0.85, warn_threshold=0.95)

# Tool catalog handed to the eval suite; populated from the arcade_math module.
catalog = ToolCatalog()
catalog.add_module(arcade_math)
@tool_eval()
def math_eval_suite():
    """Build and return the "Math Tools Evaluation" suite.

    The suite uses the module-level ``catalog`` and ``rubric`` and contains
    two cases:

    * an addition prompt that expects a call to ``add`` with
      ``a=12345`` and ``b=987654321``;
    * a square-root prompt that expects a call to ``sqrt`` with
      ``a=3224990521``.

    Each expected argument is checked by a ``BinaryCritic``; the critic
    weights within a case sum to 1.0.

    Returns:
        The populated ``EvalSuite`` instance.
    """
    math_suite = EvalSuite(
        name="Math Tools Evaluation",
        system_message=(
            "You're an AI assistant with access to math tools. "
            "Use them to help the user with their math-related tasks."
        ),
        catalog=catalog,
        rubric=rubric,
    )

    # Case 1: adding two large integers.
    addition_args = {"a": 12345, "b": 987654321}
    math_suite.add_case(
        name="Add two large numbers",
        user_message="Add 12345 and 987654321",
        expected_tool_calls=[ExpectedToolCall(func=add, args=addition_args)],
        rubric=rubric,
        # Both operands are weighted equally.
        # TODO: weight should be optional
        critics=[
            BinaryCritic(critic_field=arg_name, weight=0.5)
            for arg_name in ("a", "b")
        ],
    )

    # Case 2: square root of a large integer; the single argument carries
    # the full weight.
    sqrt_args = {"a": 3224990521}
    math_suite.add_case(
        name="Take the square root of a large number",
        user_message="What is the square root of 3224990521?",
        expected_tool_calls=[ExpectedToolCall(func=sqrt, args=sqrt_args)],
        rubric=rubric,
        critics=[BinaryCritic(critic_field="a", weight=1.0)],
    )

    return math_suite