diff --git a/arcade/arcade/cli/utils.py b/arcade/arcade/cli/utils.py
index 67dbc2d0..eb03a6fd 100644
--- a/arcade/arcade/cli/utils.py
+++ b/arcade/arcade/cli/utils.py
@@ -224,10 +224,6 @@ def _format_evaluation(evaluation: "EvaluationResult") -> str:
         A formatted string representation of the evaluation details.
     """
     result_lines = []
-
-    # Include overall final score
-    result_lines.append(f"[bold]Final Score:[/bold] {evaluation.score:.2f}\n")
-
     for critic_result in evaluation.results:
         match_color = "green" if critic_result["match"] else "red"
         field = critic_result["field"]
diff --git a/toolkits/math/evals/eval_arithmetic_tools.py b/toolkits/math/evals/eval_arithmetic_tools.py
new file mode 100644
index 00000000..c344a9fa
--- /dev/null
+++ b/toolkits/math/evals/eval_arithmetic_tools.py
@@ -0,0 +1,65 @@
+from arcade.core.catalog import ToolCatalog
+from arcade_arithmetic.tools.arithmetic import add, sqrt
+
+from arcade.sdk.eval import (
+    BinaryCritic,
+    EvalRubric,
+    EvalSuite,
+    ExpectedToolCall,
+    tool_eval,
+)
+
+# Rubric passed to the suite and to each case below (fail_threshold=0.85, warn_threshold=0.95).
+rubric = EvalRubric(
+    fail_threshold=0.85,
+    warn_threshold=0.95,
+)
+
+
+# TODO: add_toolkit didn't work here, so each tool is registered individually; revisit once fixed.
+catalog = ToolCatalog()
+catalog.add_tool(add)
+catalog.add_tool(sqrt)
+
+
+@tool_eval("gpt-4o-mini")  # presumably the model the suite is run against - TODO confirm
+def arithmetic_eval_suite():  # Builds and returns an EvalSuite with two cases (Add, sqrt).
+    suite = EvalSuite(
+        name="Arithmetic Tools Evaluation",
+        system="You are an AI assistant with access to arithmetic tools. Use them to help the user with their math-related tasks.",
+        catalog=catalog,
+        rubric=rubric,
+    )
+    # Case 1: expects one "Add" call with a=12345 and b=987654321; each arg carries weight 0.5.
+    suite.add_case(
+        name="Add two large numbers",
+        user_message="Add 12345 and 987654321",
+        expected_tool_calls=[
+            ExpectedToolCall(
+                "Add",
+                args={
+                    "a": 12345,
+                    "b": 987654321,
+                },
+            )
+        ],
+        rubric=rubric,  # NOTE(review): same rubric is already given to EvalSuite; maybe redundant
+        critics=[
+            BinaryCritic(
+                critic_field="a", weight=0.5
+            ),  # TODO: weight should be optional
+            BinaryCritic(critic_field="b", weight=0.5),
+        ],
+    )
+    # Case 2: expected call is built from a lambda around sqrt(...), unlike the name + args above.
+    suite.add_case(
+        name="Take the square root of a large number",
+        user_message="What is the square root of 3224990521?",
+        expected_tool_calls=[ExpectedToolCall(lambda: sqrt(3224990521))],
+        rubric=rubric,  # NOTE(review): confirm ExpectedToolCall accepts a callable like the one above
+        critics=[  # NOTE(review): assumes sqrt's parameter is named "a" - verify against the tool
+            BinaryCritic(critic_field="a", weight=1.0),
+        ],
+    )
+
+    return suite
diff --git a/toolkits/slack/arcade_slack/tools/chat.py b/toolkits/slack/arcade_slack/tools/chat.py
index cdc76489..82537278 100644
--- a/toolkits/slack/arcade_slack/tools/chat.py
+++ b/toolkits/slack/arcade_slack/tools/chat.py
@@ -21,7 +21,10 @@ from arcade.sdk.auth import SlackUser
 )
 def send_dm_to_user(
     context: ToolContext,
-    user_name: Annotated[str, "The Slack username of the person you want to message"],
+    user_name: Annotated[
+        str,
+        "The Slack username of the person you want to message. Slack usernames are ALWAYS lowercase.",
+    ],
     message: Annotated[str, "The message you want to send"],
 ):
     """Send a direct message to a user in Slack."""
@@ -82,7 +85,8 @@ def format_users(userListResponse: dict) -> str:
 def send_message_to_channel(
     context: ToolContext,
     channel_name: Annotated[
-        str, "The Slack channel name where you want to send the message"
+        str,
+        "The Slack channel name where you want to send the message. Slack channel names are ALWAYS lowercase.",
     ],
     message: Annotated[str, "The message you want to send"],
 ):
diff --git a/toolkits/slack/evals/eval_slack_messaging.py b/toolkits/slack/evals/eval_slack_messaging.py
index e0c64aa8..3600b2c8 100644
--- a/toolkits/slack/evals/eval_slack_messaging.py
+++ b/toolkits/slack/evals/eval_slack_messaging.py
@@ -65,8 +65,8 @@ def slack_eval_suite() -> EvalSuite:
             )
         ],
         critics=[
-            SimilarityCritic(critic_field="user_name", weight=0.4),
-            SimilarityCritic(critic_field="message", weight=0.6),
+            SimilarityCritic(critic_field="user_name", weight=0.6),
+            SimilarityCritic(critic_field="message", weight=0.4),
         ],
     )
 
@@ -83,8 +83,8 @@ def slack_eval_suite() -> EvalSuite:
             )
         ],
         critics=[
-            BinaryCritic(critic_field="user_name", weight=0.5),
-            SimilarityCritic(critic_field="message", weight=0.5),
+            BinaryCritic(critic_field="user_name", weight=0.6),
+            SimilarityCritic(critic_field="message", weight=0.4),
         ],
     )
 
@@ -102,8 +102,8 @@ def slack_eval_suite() -> EvalSuite:
             )
         ],
         critics=[
-            BinaryCritic(critic_field="channel_name", weight=0.5),
-            SimilarityCritic(critic_field="message", weight=0.5),
+            BinaryCritic(critic_field="channel_name", weight=0.6),
+            SimilarityCritic(critic_field="message", weight=0.4),
         ],
     )
 
@@ -165,8 +165,10 @@ def slack_eval_suite() -> EvalSuite:
             ),
         ],
         critics=[
-            SimilarityCritic(critic_field="user_name", weight=0.4),
-            SimilarityCritic(critic_field="message", weight=0.6),
+            SimilarityCritic(critic_field="user_name", weight=0.6),
+            SimilarityCritic(
+                critic_field="message", weight=0.4, similarity_threshold=0.7
+            ),
         ],
     )