# Patch overview (descriptive text ahead of the first "diff --git" header; not part of any hunk).
#
# NOTE(review): the hunks below appear whitespace-collapsed -- many diff lines share one physical
# line and indentation is lost -- so this text presumably will not apply as-is; regenerate it from
# the originating commit before use.
#
# 1. arcade/arcade/cli/utils.py -- _format_evaluation: removes the comment and the
#    result_lines.append(...) call that emitted the "[bold]Final Score:[/bold]" line (plus the
#    adjacent blank lines), so the visible part of the function goes from initialising
#    result_lines straight to the loop over evaluation.results.
# 2. toolkits/math/evals/eval_arithmetic_tools.py -- new file (65 added lines). Builds an
#    EvalRubric (fail_threshold=0.85, warn_threshold=0.95), registers the add and sqrt tools in
#    a ToolCatalog, and defines arithmetic_eval_suite, decorated with @tool_eval("gpt-4o-mini"),
#    with two cases: "Add two large numbers" (BinaryCritic on "a" and "b", weight 0.5 each) and
#    "Take the square root of a large number" (BinaryCritic on "a", weight 1.0). Two TODOs
#    remain in the added code (add_toolkit did not work; weight should be optional).
# 3. toolkits/slack/arcade_slack/tools/chat.py -- the Annotated descriptions of
#    send_dm_to_user's user_name and send_message_to_channel's channel_name each gain a
#    sentence stating that Slack usernames / channel names are always lowercase.
# 4. toolkits/slack/evals/eval_slack_messaging.py -- in four critics lists of slack_eval_suite
#    the name-field critic (user_name or channel_name) weight becomes 0.6 and the message
#    critic weight becomes 0.4; the last changed message SimilarityCritic also gains
#    similarity_threshold=0.7.
diff --git a/arcade/arcade/cli/utils.py b/arcade/arcade/cli/utils.py index 67dbc2d0..eb03a6fd 100644 --- a/arcade/arcade/cli/utils.py +++ b/arcade/arcade/cli/utils.py @@ -224,10 +224,6 @@ def _format_evaluation(evaluation: "EvaluationResult") -> str: A formatted string representation of the evaluation details. """ result_lines = [] - - # Include overall final score - result_lines.append(f"[bold]Final Score:[/bold] {evaluation.score:.2f}\n") - - for critic_result in evaluation.results: match_color = "green" if critic_result["match"] else "red" field = critic_result["field"] diff --git a/toolkits/math/evals/eval_arithmetic_tools.py b/toolkits/math/evals/eval_arithmetic_tools.py new file mode 100644 index 00000000..c344a9fa --- /dev/null +++ b/toolkits/math/evals/eval_arithmetic_tools.py @@ -0,0 +1,65 @@ +from arcade.core.catalog import ToolCatalog +from arcade_arithmetic.tools.arithmetic import add, sqrt + +from arcade.sdk.eval import ( + BinaryCritic, + EvalRubric, + EvalSuite, + ExpectedToolCall, + tool_eval, +) + +# Evaluation rubric +rubric = EvalRubric( + fail_threshold=0.85, + warn_threshold=0.95, +) + + +# TODO: add_toolkit didn't work +catalog = ToolCatalog() +catalog.add_tool(add) +catalog.add_tool(sqrt) + + +@tool_eval("gpt-4o-mini") +def arithmetic_eval_suite(): + suite = EvalSuite( + name="Arithmetic Tools Evaluation", + system="You are an AI assistant with access to arithmetic tools. 
Use them to help the user with their math-related tasks.", + catalog=catalog, + rubric=rubric, + ) + + suite.add_case( + name="Add two large numbers", + user_message="Add 12345 and 987654321", + expected_tool_calls=[ + ExpectedToolCall( + "Add", + args={ + "a": 12345, + "b": 987654321, + }, + ) + ], + rubric=rubric, + critics=[ + BinaryCritic( + critic_field="a", weight=0.5 + ), # TODO: weight should be optional + BinaryCritic(critic_field="b", weight=0.5), + ], + ) + + suite.add_case( + name="Take the square root of a large number", + user_message="What is the square root of 3224990521?", + expected_tool_calls=[ExpectedToolCall(lambda: sqrt(3224990521))], + rubric=rubric, + critics=[ + BinaryCritic(critic_field="a", weight=1.0), + ], + ) + + return suite diff --git a/toolkits/slack/arcade_slack/tools/chat.py b/toolkits/slack/arcade_slack/tools/chat.py index cdc76489..82537278 100644 --- a/toolkits/slack/arcade_slack/tools/chat.py +++ b/toolkits/slack/arcade_slack/tools/chat.py @@ -21,7 +21,10 @@ from arcade.sdk.auth import SlackUser ) def send_dm_to_user( context: ToolContext, - user_name: Annotated[str, "The Slack username of the person you want to message"], + user_name: Annotated[ + str, + "The Slack username of the person you want to message. Slack usernames are ALWAYS lowercase.", + ], message: Annotated[str, "The message you want to send"], ): """Send a direct message to a user in Slack.""" @@ -82,7 +85,8 @@ def format_users(userListResponse: dict) -> str: def send_message_to_channel( context: ToolContext, channel_name: Annotated[ - str, "The Slack channel name where you want to send the message" + str, + "The Slack channel name where you want to send the message. 
Slack channel names are ALWAYS lowercase.", ], message: Annotated[str, "The message you want to send"], ): diff --git a/toolkits/slack/evals/eval_slack_messaging.py b/toolkits/slack/evals/eval_slack_messaging.py index e0c64aa8..3600b2c8 100644 --- a/toolkits/slack/evals/eval_slack_messaging.py +++ b/toolkits/slack/evals/eval_slack_messaging.py @@ -65,8 +65,8 @@ def slack_eval_suite() -> EvalSuite: ) ], critics=[ - SimilarityCritic(critic_field="user_name", weight=0.4), - SimilarityCritic(critic_field="message", weight=0.6), + SimilarityCritic(critic_field="user_name", weight=0.6), + SimilarityCritic(critic_field="message", weight=0.4), ], ) @@ -83,8 +83,8 @@ def slack_eval_suite() -> EvalSuite: ) ], critics=[ - BinaryCritic(critic_field="user_name", weight=0.5), - SimilarityCritic(critic_field="message", weight=0.5), + BinaryCritic(critic_field="user_name", weight=0.6), + SimilarityCritic(critic_field="message", weight=0.4), ], ) @@ -102,8 +102,8 @@ def slack_eval_suite() -> EvalSuite: ) ], critics=[ - BinaryCritic(critic_field="channel_name", weight=0.5), - SimilarityCritic(critic_field="message", weight=0.5), + BinaryCritic(critic_field="channel_name", weight=0.6), + SimilarityCritic(critic_field="message", weight=0.4), ], ) @@ -165,8 +165,10 @@ def slack_eval_suite() -> EvalSuite: ), ], critics=[ - SimilarityCritic(critic_field="user_name", weight=0.4), - SimilarityCritic(critic_field="message", weight=0.6), + SimilarityCritic(critic_field="user_name", weight=0.6), + SimilarityCritic( + critic_field="message", weight=0.4, similarity_threshold=0.7 + ), ], )