From 2eb46a3a9849e412a2290fb2b478ea033d015000 Mon Sep 17 00:00:00 2001
From: Sam Partee <sam@arcade-ai.com>
Date: Tue, 24 Sep 2024 10:13:45 -0700
Subject: [PATCH] Client Fixes and LangGraph Examples (#50)

This PR includes several improvements to the Arcade client and adds
LangGraph examples:

1. Enhanced error handling in the Arcade client:
   - Improved HTTP error handling in `BaseArcadeClient`
   - Simplified request methods in `SyncArcadeClient` and
     `AsyncArcadeClient`

2. Updated `ToolResource` class:
   - Changed base path from `/v1/tool` to `/v1/tools`
   - Added `tool_version` parameter to `authorize` method

3. Improved Toolkit discovery:
   - Updated `find_all_arcade_toolkits` to search only in the current
     Python interpreter's site-packages

4. Added LangGraph examples:
   - New `langgraph_auth.py` example demonstrating Gmail authentication
   - New `langgraph_with_tool_exec.py` example showing tool execution
     within a LangGraph

5. Minor updates:
   - Changed default `BASE_URL` to `https://api.arcade.com/`
   - Updated import error message for eval dependencies

---------

Co-authored-by: Nate Barbettini <nate@arcade-ai.com>
---
 .vscode/launch.json                           |   6 +-
 arcade/arcade/actor/core/base.py              |   2 +-
 arcade/arcade/cli/launcher.py                 | 378 ++++++++++++++++++
 arcade/arcade/cli/main.py                     | 136 +++++--
 arcade/arcade/cli/serve.py                    | 105 ++++-
 arcade/arcade/cli/utils.py                    |  31 +-
 arcade/arcade/client/base.py                  |  43 +-
 arcade/arcade/client/client.py                |  68 ++--
 arcade/arcade/client/schema.py                |   3 +
 arcade/arcade/core/catalog.py                 |  23 +-
 arcade/arcade/core/config_model.py            |  32 +-
 arcade/arcade/core/env.py                     |  20 -
 arcade/arcade/core/schema.py                  |   4 +-
 arcade/arcade/core/toolkit.py                 |   9 +-
 arcade/arcade/sdk/__init__.py                 |  16 -
 arcade/arcade/sdk/eval/eval.py                |  92 ++++-
 arcade/pyproject.toml                         |   7 +-
 arcade/tests/client/test_client.py            | 107 ++---
 arcade/tests/core/test_catalog.py             |   6 +-
 docker/Dockerfile                             |   6 +-
 examples/langchain/gmail.py                   |  68 ----
 examples/langchain/langgraph_auth.py          |  60 +++
 .../langchain/langgraph_with_tool_exec.py     |  63 +++
 examples/modal-deploy.py                      |   2 +-
 toolkits/google/pyproject.toml                |   4 +-
 ...arithmetic_tools.py => eval_math_tools.py} |  22 +-
 toolkits/math/pyproject.toml                  |   4 +-
 toolkits/search/evals/eval_google_search.py   | 239 +++++++++++
 toolkits/search/pyproject.toml                |   4 +-
 toolkits/slack/evals/eval_slack_messaging.py  |  97 +++--
 toolkits/slack/pyproject.toml                 |   2 +-
 toolkits/x/evals/eval_x_tools.py              |  35 +-
 32 files changed, 1291 insertions(+), 403 deletions(-)
 create mode 100644 arcade/arcade/cli/launcher.py
 delete mode 100644 arcade/arcade/core/env.py
 delete mode 100644 examples/langchain/gmail.py
 create mode 100644 examples/langchain/langgraph_auth.py
 create mode 100644 examples/langchain/langgraph_with_tool_exec.py
 rename toolkits/math/evals/{eval_arithmetic_tools.py => eval_math_tools.py} (81%)
 create mode 100644 toolkits/search/evals/eval_google_search.py

diff --git a/.vscode/launch.json b/.vscode/launch.json
index 4bab8cb6..0e597ca4 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -34,15 +34,15 @@
       "cwd": "${workspaceFolder}"
     },
     {
-      "name": "Debug `arcade evals -d`",
+      "name": "Debug `arcade evals -d` on current file",
       "type": "python",
       "request": "launch",
       "program": "${workspaceFolder}/arcade/run_cli.py",
-      "args": ["evals", "-d"],
+      "args": ["evals", "-d", "${fileDirname}", "-h", "localhost"],
       "console": "integratedTerminal",
       "jinja": true,
       "justMyCode": true,
-      "cwd": "${workspaceFolder}"
+      "cwd": ""
     }
   ]
 }
diff --git a/arcade/arcade/actor/core/base.py b/arcade/arcade/actor/core/base.py
index 823a5e0b..c888bb32 100644
--- a/arcade/arcade/actor/core/base.py
+++ b/arcade/arcade/actor/core/base.py
@@ -73,7 +73,7 @@ class BaseActor(Actor):
         """
         return [tool.definition for tool in self.catalog]
 
-    def register_tool(self, tool: Callable, toolkit_name: str | None = None) -> None:
+    def register_tool(self, tool: Callable, toolkit_name: str) -> None:
         """
         Register a tool to the catalog.
         """
diff --git a/arcade/arcade/cli/launcher.py b/arcade/arcade/cli/launcher.py
new file mode 100644
index 00000000..515d1276
--- /dev/null
+++ b/arcade/arcade/cli/launcher.py
@@ -0,0 +1,378 @@
+import io
+import ipaddress
+import logging
+import os
+import shutil
+import signal
+import subprocess
+import sys
+import threading
+import time
+from pathlib import Path
+from typing import Callable
+
+from rich.console import Console
+
+console = Console(highlight=False)
+logger = logging.getLogger(__name__)
+
+
+def start_servers(
+    host: str,
+    port: int,
+    engine_config: str | None,
+) -> None:
+    """
+    Start the actor and engine servers.
+
+    Args:
+        host: Host for the actor server.
+        port: Port for the actor server.
+        engine_config: Path to the engine configuration file.
+    """
+    # Validate host and port
+    host = _validate_host(host)
+    port = _validate_port(port)
+
+    # Ensure engine_config is provided and validated
+    engine_config = _get_engine_config(engine_config)
+
+    # Prepare command-line arguments for the actor server and engine
+    actor_cmd = _build_actor_command(host, port)
+    engine_cmd = _build_engine_command(engine_config)
+
+    # Start and manage the processes
+    _manage_processes(actor_cmd, engine_cmd)
+
+
+def _validate_host(host: str) -> str:
+    """
+    Validates the host input.
+
+    Args:
+        host: Host for the actor server.
+
+    Returns:
+        The validated host as a string.
+
+    Raises:
+        ValueError: If the host is invalid.
+    """
+    try:
+        # Validate IP address
+        ipaddress.ip_address(host)
+    except ValueError:
+        # Optionally, validate hostname
+        if not host.isalnum() and "-" not in host and "." not in host:
+            console.print(f"❌ Invalid host: {host}", style="bold red")
+            raise ValueError("Invalid host.")
+    return host
+
+
+def _validate_port(port: int) -> int:
+    """
+    Validates the port input.
+
+    Args:
+        port: Port for the actor server.
+
+    Returns:
+        The validated port as an integer.
+
+    Raises:
+        ValueError: If the port is out of the valid range.
+    """
+    if not (1 <= port <= 65535):
+        console.print(f"❌ Invalid port: {port}", style="bold red")
+        raise ValueError("Invalid port.")
+    return port
+
+
+def _get_engine_config(engine_config: str | None) -> str:
+    """
+    Determines and validates the engine config file path.
+
+    Args:
+        engine_config: Optional path provided by the user.
+
+    Returns:
+        The resolved engine config file path.
+
+    Raises:
+        RuntimeError: If the config file is not found or invalid.
+    """
+    if engine_config:
+        engine_config_path = Path(os.path.expanduser(engine_config)).resolve()
+        if not engine_config_path.is_file():
+            console.print(
+                f"❌ Engine config file not found at {engine_config_path}", style="bold red"
+            )
+            raise RuntimeError("Engine config file not found.")
+    else:
+        # Look for engine.yaml in the current directory
+        engine_config_path = Path(os.getcwd()) / "engine.yaml"
+        if not engine_config_path.is_file():
+            console.print(
+                "❌ Engine config file not specified and not found in current directory.",
+                style="bold red",
+            )
+            raise RuntimeError("Engine config file not specified.")
+    return str(engine_config_path)
+
+
+def _build_actor_command(host: str, port: int) -> list[str]:
+    """
+    Builds the command to start the actor server.
+
+    Args:
+        host: Host for the actor server.
+        port: Port for the actor server.
+
+    Returns:
+        The command as a list.
+    """
+    # Expand full path to "arcade" executable
+    arcade_bin = shutil.which("arcade")
+    if not arcade_bin:
+        console.print(
+            "❌ Arcade binary not found, please install with `pip install arcade-ai`",
+            style="bold red",
+        )
+        sys.exit(1)
+    cmd = [
+        arcade_bin,
+        "dev",
+        "--host",
+        host,
+        "--port",
+        str(port),
+    ]
+    return cmd
+
+
+def _build_engine_command(engine_config: str) -> list[str]:
+    """
+    Builds the command to start the engine.
+
+    Args:
+        engine_config: Path to the engine configuration file.
+
+    Returns:
+        The command as a list.
+    """
+    engine_bin = shutil.which("engine")
+    if not engine_bin:
+        console.print(
+            "❌ Engine binary not found, refer to the installation guide at "
+            "https://docs.arcade-ai.com/docs/home/deployment for how to install the engine",
+            style="bold red",
+        )
+        sys.exit(1)
+    cmd = [
+        engine_bin,
+        "dev",
+        "-c",
+        engine_config,
+    ]
+    return cmd
+
+
+def _manage_processes(actor_cmd: list[str], engine_cmd: list[str]) -> None:
+    """
+    Manages the lifecycle of the actor and engine processes.
+
+    Args:
+        actor_cmd: The command to start the actor server.
+        engine_cmd: The command to start the engine.
+    """
+    actor_process: subprocess.Popen | None = None
+    engine_process: subprocess.Popen | None = None
+
+    def terminate_processes(exit_program: bool = False) -> None:
+        console.print("Terminating child processes...", style="bold yellow")
+        _terminate_process(actor_process)
+        _terminate_process(engine_process)
+        if exit_program:
+            sys.exit(0)
+
+    _setup_signal_handlers(terminate_processes)
+
+    retry_count = 0
+    max_retries = 3  # Define the maximum number of retries
+
+    while retry_count <= max_retries:
+        try:
+            # Start the actor server
+            console.print("Starting actor server...", style="bold green")
+            actor_process = _start_process("Actor", actor_cmd)
+
+            # Wait a bit to ensure actor is up
+            time.sleep(2)
+
+            # Start the engine
+            console.print("Starting engine...", style="bold green")
+            engine_process = _start_process("Engine", engine_cmd)
+
+            # Monitor processes
+            _monitor_processes(actor_process, engine_process)
+
+            # If we reach here, one of the processes has exited
+            retry_count += 1
+            console.print(
+                f"Processes exited. Retry {retry_count} of {max_retries}.", style="bold yellow"
+            )
+
+            if retry_count > max_retries:
+                console.print(f"❌ Exiting after {retry_count - 1} retries", style="bold red")
+                terminate_processes(exit_program=True)
+                break  # Exit the loop
+
+        except Exception as e:
+            console.print(f"❌ Exception occurred: {e}", style="bold red")
+            terminate_processes()
+            retry_count += 1
+            if retry_count > max_retries:
+                console.print(
+                    f"❌ Exiting after {retry_count - 1} retries due to exceptions",
+                    style="bold red",
+                )
+                sys.exit(1)
+                break  # Not strictly necessary, but good practice
+
+    console.print("Exiting...", style="bold red")
+    sys.exit(1)
+
+
+def _start_process(name: str, cmd: list[str]) -> subprocess.Popen:
+    """
+    Starts a subprocess and begins streaming its output.
+
+    Args:
+        name: Name of the process.
+        cmd: Command to execute.
+
+    Returns:
+        The subprocess.Popen object.
+
+    Raises:
+        RuntimeError: If the process fails to start.
+    """
+    try:
+        process = subprocess.Popen(  # noqa: S603, RUF100
+            cmd,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            universal_newlines=True,
+            bufsize=1,
+            shell=False,
+        )
+        _stream_output(process, name)
+        return process  # noqa: TRY300
+    except Exception as e:
+        console.print(f"❌ Failed to start {name}: {e}", style="bold red")
+        raise RuntimeError(f"Failed to start {name}")
+
+
+def _stream_output(process: subprocess.Popen, name: str) -> None:
+    """
+    Streams the output from a subprocess to the console.
+
+    Args:
+        process: The subprocess.Popen object.
+        name: Name of the process.
+    """
+    stdout_style = "green" if name == "Actor" else "#87CEFA"
+
+    def stream(pipe: io.TextIOWrapper | None, style: str) -> None:
+        if pipe is None:
+            return
+        with pipe:
+            for line in iter(pipe.readline, ""):
+                console.print(f"[{style}]{name}>[/{style}] {line.rstrip()}")
+
+    threading.Thread(target=stream, args=(process.stdout, stdout_style), daemon=True).start()
+    threading.Thread(target=stream, args=(process.stderr, "red"), daemon=True).start()
+
+
+def _monitor_processes(actor_process: subprocess.Popen, engine_process: subprocess.Popen) -> None:
+    """
+    Monitors the actor and engine processes, restarts them if they exit.
+
+    Args:
+        actor_process: The actor subprocess.
+        engine_process: The engine subprocess.
+    """
+    while True:
+        actor_status = actor_process.poll()
+        engine_status = engine_process.poll()
+
+        if actor_status is not None or engine_status is not None:
+            if actor_status is not None:
+                console.print(
+                    f"Actor process exited with code {actor_status}. Restarting both processes...",
+                    style="bold red",
+                )
+            if engine_status is not None:
+                console.print(
+                    f"Engine process exited with code {engine_status}. Restarting both processes...",
+                    style="bold red",
+                )
+            _terminate_process(actor_process)
+            _terminate_process(engine_process)
+            time.sleep(1)
+            break  # Exit to restart both processes
+        else:
+            time.sleep(1)
+
+
+def _terminate_process(process: subprocess.Popen | None) -> None:
+    """
+    Terminates a subprocess if it's running.
+
+    Args:
+        process: The subprocess.Popen object.
+    """
+    if process and process.poll() is None:
+        process.terminate()
+        try:
+            process.wait(timeout=5)
+        except subprocess.TimeoutExpired:
+            process.kill()
+
+
+def _setup_signal_handlers(terminate_processes: Callable[[bool], None]) -> None:
+    """
+    Setup signal handlers to handle process termination signals.
+
+    Args:
+        terminate_processes: Function to call to terminate child processes.
+    """
+    signals_to_handle = ["SIGINT", "SIGTERM", "SIGQUIT", "SIGHUP"]
+
+    for sig_name in signals_to_handle:
+        sig = getattr(signal, sig_name, None)
+        if sig is None:
+            continue  # Signal not available on this platform
+        try:
+            # Use a lambda to pass the terminate_processes function
+            signal.signal(
+                sig,
+                lambda signum, frame: _handle_signal(signum, terminate_processes),
+            )
+        except (ValueError, RuntimeError):
+            # Signal handling not allowed in this thread or invalid signal
+            console.print(f"Warning: Cannot set handler for {sig_name}", style="bold yellow")
+            continue
+
+
+def _handle_signal(signum: int, terminate_processes: Callable[[bool], None]) -> None:
+    """
+    Handle received signal and terminate child processes.
+
+    Args:
+        signum: The signal number received.
+        terminate_processes: Function to call to terminate child processes.
+    """
+    signal_name = signal.Signals(signum).name
+    console.print(f"Received {signal_name}. Shutting down...", style="bold yellow")
+    terminate_processes(exit_program=True)  # type: ignore[call-arg]
diff --git a/arcade/arcade/cli/main.py b/arcade/arcade/cli/main.py
index 99da63d2..4967faae 100644
--- a/arcade/arcade/cli/main.py
+++ b/arcade/arcade/cli/main.py
@@ -15,6 +15,7 @@ from rich.table import Table
 from rich.text import Text
 
 from arcade.cli.authn import LocalAuthCallbackServer, check_existing_login
+from arcade.cli.launcher import start_servers
 from arcade.cli.utils import (
     OrderCommands,
     apply_config_overrides,
@@ -28,14 +29,41 @@ from arcade.cli.utils import (
 )
 from arcade.client import Arcade
 from arcade.client.errors import EngineNotHealthyError, EngineOfflineError
+from arcade.core.config_model import Config
 
 cli = typer.Typer(
     cls=OrderCommands,
+    add_completion=False,
+    no_args_is_help=True,
+    pretty_exceptions_enable=False,
+    pretty_exceptions_show_locals=False,
+    pretty_exceptions_short=True,
 )
 console = Console()
 
 
-@cli.command(help="Log in to Arcade Cloud")
+def _get_config_with_overrides(
+    force_tls: bool,
+    force_no_tls: bool,
+    host_input: str | None = None,
+    port_input: int | None = None,
+) -> Config:
+    """
+    Get the config with CLI-specific optional overrides applied.
+    """
+    config = validate_and_get_config()
+
+    if not force_tls and not force_no_tls:
+        tls_input = None
+    elif force_no_tls:
+        tls_input = False
+    else:
+        tls_input = True
+    apply_config_overrides(config, host_input, port_input, tls_input)
+    return config
+
+
+@cli.command(help="Log in to Arcade Cloud", rich_help_panel="User")
 def login(
     host: str = typer.Option(
         "cloud.arcade-ai.com",
@@ -74,7 +102,7 @@ def login(
             server_thread.join()  # Ensure the server thread completes and cleans up
 
 
-@cli.command(help="Log out of Arcade Cloud")
+@cli.command(help="Log out of Arcade Cloud", rich_help_panel="User")
 def logout() -> None:
     """
     Logs the user out of Arcade Cloud.
@@ -89,7 +117,7 @@ def logout() -> None:
         console.print("You're not logged in.", style="bold red")
 
 
-@cli.command(help="Create a new toolkit package directory")
+@cli.command(help="Create a new toolkit package directory", rich_help_panel="Tool Development")
 def new(
     directory: str = typer.Option(os.getcwd(), "--dir", help="tools directory path"),
 ) -> None:
@@ -105,7 +133,10 @@ def new(
         console.print(error_message, style="bold red")
 
 
-@cli.command(help="Show the available tools in an actor or toolkit directory")
+@cli.command(
+    help="Show the installed toolkits",
+    rich_help_panel="Tool Development",
+)
 def show(
     toolkit: Optional[str] = typer.Option(
         None, "-t", "--toolkit", help="The toolkit to show the tools of"
@@ -139,12 +170,13 @@ def show(
         console.print(error_message, style="bold red")
 
 
-@cli.command(help="Chat with a language model")
+@cli.command(help="Start Arcade Chat in the terminal", rich_help_panel="Launch")
 def chat(
     model: str = typer.Option("gpt-4o", "-m", help="The model to use for prediction."),
     stream: bool = typer.Option(
         False, "-s", "--stream", is_flag=True, help="Stream the tool output."
     ),
+    debug: bool = typer.Option(False, "--debug", "-d", help="Show debug information"),
     host: str = typer.Option(
         None,
         "-h",
@@ -167,20 +199,11 @@ def chat(
         "--no-tls",
         help="Whether to disable TLS for the connection to the Arcade Engine.",
     ),
-    debug: bool = typer.Option(False, "--debug", "-d", help="Show debug information"),
 ) -> None:
     """
     Chat with a language model.
     """
-    config = validate_and_get_config()
-
-    if not force_tls and not force_no_tls:
-        tls_input = None
-    elif force_no_tls:
-        tls_input = False
-    else:
-        tls_input = True
-    apply_config_overrides(config, host, port, tls_input)
+    config = _get_config_with_overrides(force_tls, force_no_tls, host, port)
 
     client = Arcade(api_key=config.api.key, base_url=config.engine_url)
     user_email = config.user.email if config.user else None
@@ -276,7 +299,7 @@ def chat(
         raise typer.Exit()
 
 
-@cli.command(help="Start an Actor server with specified configurations.")
+@cli.command(help="Start a local Arcade Actor server", rich_help_panel="Launch")
 def dev(
     host: str = typer.Option(
         "127.0.0.1", help="Host for the app, from settings by default.", show_default=True
@@ -300,7 +323,6 @@ def dev(
     try:
         serve_default_actor(host, port, disable_auth)
     except KeyboardInterrupt:
-        console.print("actor stopped by user.", style="bold red")
         typer.Exit()
     except Exception as e:
         error_message = f"❌ Failed to start Arcade Actor: {escape(str(e))}"
@@ -308,7 +330,7 @@ def dev(
         raise typer.Exit(code=1)
 
 
-@cli.command(help="Show/edit configuration details of the Arcade Engine")
+@cli.command(help="Show/edit the local Arcade configuration", rich_help_panel="User")
 def config(
     action: str = typer.Argument("show", help="The action to take (show/edit)"),
     key: str = typer.Option(
@@ -396,7 +418,7 @@ def display_config_as_table(config) -> None:  # type: ignore[no-untyped-def]
     console.print(table)
 
 
-@cli.command(help="Run evaluation suites in a directory")
+@cli.command(help="Run tool calling evaluations", rich_help_panel="Tool Development")
 def evals(
     directory: str = typer.Argument(".", help="Directory containing evaluation files"),
     show_details: bool = typer.Option(False, "--details", "-d", help="Show detailed results"),
@@ -409,11 +431,35 @@ def evals(
     models: str = typer.Option(
         "gpt-4o", "--models", "-m", help="The models to use for evaluation (default: gpt-4o)"
     ),
+    host: str = typer.Option(
+        None,
+        "-h",
+        "--host",
+        help="The Arcade Engine address to send chat requests to.",
+    ),
+    port: int = typer.Option(
+        None,
+        "-p",
+        "--port",
+        help="The port of the Arcade Engine.",
+    ),
+    force_tls: bool = typer.Option(
+        False,
+        "--tls",
+        help="Whether to force TLS for the connection to the Arcade Engine. If not specified, the connection will use TLS if the engine URL uses a 'https' scheme.",
+    ),
+    force_no_tls: bool = typer.Option(
+        False,
+        "--no-tls",
+        help="Whether to disable TLS for the connection to the Arcade Engine.",
+    ),
 ) -> None:
     """
     Find all files starting with 'eval_' in the given directory,
     execute any functions decorated with @tool_eval, and display the results.
     """
+    config = _get_config_with_overrides(force_tls, force_no_tls, host, port)
+
     models = models.split(",")  # type: ignore[assignment]
     eval_files = [f for f in os.listdir(directory) if f.startswith("eval_") and f.endswith(".py")]
 
@@ -421,6 +467,18 @@ def evals(
         console.print("No evaluation files found.", style="bold yellow")
         return
 
+    if show_details:
+        console.print(
+            Text.assemble(
+                ("\nRunning evaluations against Arcade Engine at ", "bold"),
+                (config.engine_url, "bold blue"),
+            )
+        )
+
+    # Try to hit /health endpoint on engine and warn if it is down
+    client = Arcade(api_key=config.api.key, base_url=config.engine_url)
+    log_engine_health(client)
+
     for file in eval_files:
         file_path = os.path.join(directory, file)
         module_name = file[:-3]  # Remove .py extension
@@ -432,17 +490,47 @@ def evals(
         module = importlib.util.module_from_spec(spec)
         spec.loader.exec_module(module)  # type: ignore[union-attr]
 
-        eval_functions = [
+        eval_suites = [
             obj
             for name, obj in module.__dict__.items()
             if callable(obj) and hasattr(obj, "__tool_eval__")
         ]
 
-        if not eval_functions:
+        if not eval_suites:
             console.print(f"No @tool_eval functions found in {file}", style="bold yellow")
             continue
 
-        for func in eval_functions:
-            console.print(f"\nRunning evaluation from {file}: {func.__name__}", style="bold blue")
-            results = func(models=models, max_concurrency=max_concurrent)
+        if show_details:
+            suite_label = "suite" if len(eval_suites) == 1 else "suites"
+            console.print(f"\nFound {len(eval_suites)} {suite_label} in {file}", style="bold")
+
+        for suite_func in eval_suites:
+            console.print(
+                Text.assemble(
+                    ("\nRunning evaluations in ", "bold"),
+                    (suite_func.__name__, "bold blue"),
+                )
+            )
+            results = suite_func(config=config, models=models, max_concurrency=max_concurrent)
             display_eval_results(results, show_details=show_details)
+
+
+@cli.command(help="Start an Arcade Cluster instance", rich_help_panel="Launch")
+def up(
+    host: str = typer.Option("127.0.0.1", help="Host for the actor server.", show_default=True),
+    port: int = typer.Option(
+        8002, "-p", "--port", help="Port for the actor server.", show_default=True
+    ),
+    engine_config: str = typer.Option(
+        None, "-c", "--config", help="Path to the engine configuration file."
+    ),
+) -> None:
+    """
+    Start both the actor and engine servers.
+    """
+    try:
+        start_servers(host, port, engine_config)
+    except Exception as e:
+        error_message = f"❌ Failed to start servers: {escape(str(e))}"
+        console.print(error_message, style="bold red")
+        raise typer.Exit(code=1)
diff --git a/arcade/arcade/cli/serve.py b/arcade/arcade/cli/serve.py
index 81a07a71..de993876 100644
--- a/arcade/arcade/cli/serve.py
+++ b/arcade/arcade/cli/serve.py
@@ -1,7 +1,11 @@
+import asyncio
 import logging
 import os
+import sys
+from contextlib import asynccontextmanager
+from typing import Any
 
-from rich.console import Console
+from loguru import logger
 
 try:
     import fastapi
@@ -18,29 +22,73 @@ except ImportError:
 from arcade.actor.fastapi.actor import FastAPIActor
 from arcade.core.toolkit import Toolkit
 
-DEVELOPMENT_SECRET = "dev"  # noqa: S105
 
-logger = logging.getLogger(__name__)
-console = Console()
+class InterceptHandler(logging.Handler):
+    def emit(self, record: logging.LogRecord) -> None:
+        # Get corresponding Loguru level if it exists
+        try:
+            level = logger.level(record.levelname).name
+        except ValueError:
+            level = record.levelno  # type: ignore[assignment]
+
+        # Find caller from where originated the logged message
+        frame, depth = sys._getframe(6), 6
+        while frame and frame.f_code.co_filename == logging.__file__:
+            frame = frame.f_back  # type: ignore[assignment]
+            depth += 1
+
+        logger.opt(depth=depth, exception=record.exc_info).log(level, record.getMessage())
+
+
+def setup_logging(log_level: int = logging.INFO) -> None:
+    # Intercept everything at the root logger
+    logging.root.handlers = [InterceptHandler()]
+    logging.root.setLevel(log_level)
+
+    # Remove every other logger's handlers
+    # and propagate to root logger
+    for name in logging.root.manager.loggerDict:
+        logging.getLogger(name).handlers = []
+        logging.getLogger(name).propagate = True
+
+    # Configure loguru with custom format, no colors
+    logger.configure(
+        handlers=[
+            {
+                "sink": sys.stdout,
+                "serialize": False,
+                "level": log_level,
+                "format": "{time:MM-DD HH:mm:ss} | {level: <8} | {message}"
+                + (" {name}:{function}:{line}" if log_level <= logging.DEBUG else "")
+                + ("{exception}\n" if "{exception}" in "{message}" else ""),
+            }
+        ]
+    )
+
+
+@asynccontextmanager
+async def lifespan(app: fastapi.FastAPI):  # type: ignore[no-untyped-def]
+    try:
+        yield
+    except asyncio.CancelledError:
+        # This is necessary to prevent an unhandled error
+        # when the user presses Ctrl+C
+        logger.debug("Lifespan cancelled.")
 
 
 def serve_default_actor(
-    host: str = "127.0.0.1", port: int = 8000, disable_auth: bool = False
+    host: str = "127.0.0.1",
+    port: int = 8002,
+    disable_auth: bool = False,
+    workers: int = 1,
+    timeout_keep_alive: int = 5,
+    **kwargs: Any,
 ) -> None:
     """
     Get an instance of a FastAPI server with the Arcade Actor.
     """
-    # Use Uvicorn's default log config for Arcade logging,
-    # to ensure a nice consistent style for all logs.
-    logging_config = uvicorn.config.LOGGING_CONFIG
-    logging_config["loggers"]["arcade"] = {
-        "handlers": ["default"],
-        "level": "INFO",
-        "propagate": False,
-    }
-
-    # TODO: Pass in a logging config from the CLI, to set the log level.
-    logging.config.dictConfig(logging_config)
+    # Setup unified logging
+    setup_logging()
 
     toolkits = Toolkit.find_all_arcade_toolkits()
     if not toolkits:
@@ -56,12 +104,13 @@ def serve_default_actor(
         logger.warning(
             "Warning: ARCADE_ACTOR_SECRET environment variable is not set. Using 'dev' as the actor secret.",
         )
-        actor_secret = DEVELOPMENT_SECRET
+        actor_secret = actor_secret or "dev"
 
     app = fastapi.FastAPI(
         title="Arcade AI Actor",
         description="Arcade AI default Actor implementation using FastAPI.",
         version="0.1.0",
+        lifespan=lifespan,  # Use custom lifespan to catch errors, notably KeyboardInterrupt (Ctrl+C)
     )
     actor = FastAPIActor(app, secret=actor_secret, disable_auth=disable_auth)
     for toolkit in toolkits:
@@ -69,9 +118,27 @@ def serve_default_actor(
 
     logger.info("Starting FastAPI server...")
 
-    uvicorn.run(
+    class CustomUvicornServer(uvicorn.Server):
+        def install_signal_handlers(self) -> None:
+            pass  # Disable Uvicorn's default signal handlers
+
+    config = uvicorn.Config(
         app=app,
         host=host,
         port=port,
-        log_config=logging_config,
+        workers=workers,
+        timeout_keep_alive=timeout_keep_alive,
+        log_config=None,
+        **kwargs,
     )
+    server = CustomUvicornServer(config=config)
+
+    async def serve() -> None:
+        await server.serve()
+
+    try:
+        asyncio.run(serve())
+    except KeyboardInterrupt:
+        logger.info("Server stopped by user.")
+    finally:
+        logger.debug("Server shutdown complete.")
diff --git a/arcade/arcade/cli/utils.py b/arcade/arcade/cli/utils.py
index 61608d67..cdbf614a 100644
--- a/arcade/arcade/cli/utils.py
+++ b/arcade/arcade/cli/utils.py
@@ -249,18 +249,21 @@ def _format_evaluation(evaluation: "EvaluationResult") -> str:
         A formatted string representation of the evaluation details.
     """
     result_lines = []
-    for critic_result in evaluation.results:
-        match_color = "green" if critic_result["match"] else "red"
-        field = critic_result["field"]
-        score = critic_result["score"]
-        weight = critic_result["weight"]
-        expected = critic_result["expected"]
-        actual = critic_result["actual"]
-        result_lines.append(
-            f"[bold]{field}:[/bold] "
-            f"[{match_color}]Match: {critic_result['match']}, "
-            f"Score: {score:.2f}/{weight:.2f}[/{match_color}]"
-            f"\n    Expected: {expected}"
-            f"\n    Actual: {actual}"
-        )
+    if evaluation.failure_reason:
+        result_lines.append(f"[bold red]Failure Reason:[/bold red] {evaluation.failure_reason}")
+    else:
+        for critic_result in evaluation.results:
+            match_color = "green" if critic_result["match"] else "red"
+            field = critic_result["field"]
+            score = critic_result["score"]
+            weight = critic_result["weight"]
+            expected = critic_result["expected"]
+            actual = critic_result["actual"]
+            result_lines.append(
+                f"[bold]{field}:[/bold] "
+                f"[{match_color}]Match: {critic_result['match']}, "
+                f"Score: {score:.2f}/{weight:.2f}[/{match_color}]"
+                f"\n    Expected: {expected}"
+                f"\n    Actual: {actual}"
+            )
     return "\n".join(result_lines)
diff --git a/arcade/arcade/client/base.py b/arcade/arcade/client/base.py
index b63113b7..c88b1868 100644
--- a/arcade/arcade/client/base.py
+++ b/arcade/arcade/client/base.py
@@ -1,4 +1,3 @@
-import os
 from typing import Any, Generic, TypeVar
 from urllib.parse import urljoin
 
@@ -13,19 +12,20 @@ from arcade.client.errors import (
     RateLimitError,
     UnauthorizedError,
 )
+from arcade.client.schema import OPENAI_API_VERSION
 
 T = TypeVar("T")
 ResponseT = TypeVar("ResponseT")
 
-API_VERSION = "v1"
-BASE_URL = "http://localhost:9099"
-
 
 class BaseResource(Generic[T]):
     """Base class for all resources."""
 
-    def __init__(self, client: T):
+    _path: str
+
+    def __init__(self, client: T) -> None:
         self._client = client
+        self._resource_path = self._client._base_url + self._path  # type: ignore[attr-defined]
 
 
 class BaseArcadeClient:
@@ -33,7 +33,7 @@ class BaseArcadeClient:
 
     def __init__(
         self,
-        base_url: str = BASE_URL,
+        base_url: str | None = None,
         api_key: str | None = None,
         headers: dict[str, str] | None = None,
         timeout: float | Timeout = 10.0,
@@ -49,8 +49,14 @@ class BaseArcadeClient:
             timeout: Request timeout in seconds.
             retries: Number of retries for failed requests.
         """
+        if base_url is None or api_key is None:
+            from arcade.core.config import config
+
+            base_url = base_url or config.engine_url
+            api_key = api_key or config.api.key
         self._base_url = base_url
-        self._api_key = api_key or os.environ.get("ARCADE_API_KEY")
+        self._api_key = api_key
+
         self._headers = headers or {}
         self._headers.setdefault("Authorization", f"Bearer {self._api_key}")
         self._headers.setdefault("Content-Type", "application/json")
@@ -65,8 +71,8 @@ class BaseArcadeClient:
 
     def _chat_url(self, base_url: str) -> str:
         chat_url = str(base_url)
-        if not base_url.endswith(API_VERSION):
-            chat_url = f"{base_url}/{API_VERSION}"
+        if not base_url.endswith(OPENAI_API_VERSION):
+            chat_url = f"{base_url}/{OPENAI_API_VERSION}"
         return chat_url
 
     def _handle_http_error(self, e: httpx.HTTPStatusError) -> None:
@@ -80,7 +86,10 @@ class BaseArcadeClient:
         }
         status_code = e.response.status_code
         error_class = error_map.get(status_code, InternalServerError)
-        raise error_class(str(e), response=e.response)
+        msg = e.response.json()
+        if isinstance(msg, dict) and "error" in msg:
+            raise error_class(msg["error"], response=e.response) from None
+        raise error_class(msg, response=e.response) from None
 
 
 class SyncArcadeClient(BaseArcadeClient):
@@ -94,7 +103,7 @@ class SyncArcadeClient(BaseArcadeClient):
             timeout=self._timeout,
         )
 
-    def _request(self, method: str, path: str, **kwargs: Any) -> httpx.Response:
+    def _request(self, method: str, path: str, **kwargs: Any) -> httpx.Response:  # type: ignore[return]
         """
         Make a synchronous HTTP request.
         """
@@ -104,10 +113,9 @@ class SyncArcadeClient(BaseArcadeClient):
                 response = self._client.request(method, url, **kwargs)
                 response.raise_for_status()
                 return response  # noqa: TRY300
-            except httpx.HTTPStatusError:
+            except httpx.HTTPStatusError as e:
                 if attempt == self._retries - 1:
-                    raise
-        raise RuntimeError("This should never be reached")
+                    self._handle_http_error(e)
 
     def close(self) -> None:
         """Close the client session."""
@@ -139,7 +147,7 @@ class AsyncArcadeClient(BaseArcadeClient):
             )
         return self._client
 
-    async def _request(self, method: str, path: str, **kwargs: Any) -> httpx.Response:
+    async def _request(self, method: str, path: str, **kwargs: Any) -> httpx.Response:  # type: ignore[return]
         """
         Make an asynchronous HTTP request.
         """
@@ -150,10 +158,9 @@ class AsyncArcadeClient(BaseArcadeClient):
                 response = await client.request(method, url, **kwargs)
                 response.raise_for_status()
                 return response  # noqa: TRY300
-            except httpx.HTTPStatusError:
+            except httpx.HTTPStatusError as e:
                 if attempt == self._retries - 1:
-                    raise
-        raise RuntimeError("This should never be reached")
+                    self._handle_http_error(e)
 
     async def close(self) -> None:
         """Close the client session."""
diff --git a/arcade/arcade/client/client.py b/arcade/arcade/client/client.py
index 346a3253..499992b3 100644
--- a/arcade/arcade/client/client.py
+++ b/arcade/arcade/client/client.py
@@ -1,11 +1,9 @@
 from typing import Any, TypeVar, Union
 
-import httpx
 from openai import AsyncOpenAI, OpenAI
 from openai.resources.chat import AsyncChat, Chat
 
 from arcade.client.base import (
-    API_VERSION,
     AsyncArcadeClient,
     BaseResource,
     SyncArcadeClient,
@@ -27,7 +25,7 @@ ClientT = TypeVar("ClientT", SyncArcadeClient, AsyncArcadeClient)
 class AuthResource(BaseResource[ClientT]):
     """Authentication resource."""
 
-    _base_path = f"/{API_VERSION}/auth"
+    _path = "/auth"
 
     def authorize(
         self,
@@ -59,7 +57,7 @@ class AuthResource(BaseResource[ClientT]):
 
         data = self._client._execute_request(  # type: ignore[attr-defined]
             "POST",
-            f"{self._base_path}/authorize",
+            f"{self._resource_path}/authorize",
             json=body,
         )
         return AuthResponse(**data)
@@ -85,7 +83,7 @@ class AuthResource(BaseResource[ClientT]):
 
         data = self._client._execute_request(  # type: ignore[attr-defined]
             "GET",
-            f"{self._base_path}/status",
+            f"{self._resource_path}/status",
             params={"authorizationId": auth_id, "scopes": " ".join(scopes) if scopes else None},
         )
         return AuthResponse(**data)
@@ -94,7 +92,7 @@ class AuthResource(BaseResource[ClientT]):
 class ToolResource(BaseResource[ClientT]):
     """Tool resource."""
 
-    _base_path = f"/{API_VERSION}/tool"
+    _path = "/tools"
 
     def run(
         self,
@@ -119,7 +117,7 @@ class ToolResource(BaseResource[ClientT]):
             "inputs": inputs,
         }
         data = self._client._execute_request(  # type: ignore[attr-defined]
-            "POST", f"{self._base_path}/execute", json=request_data
+            "POST", f"{self._resource_path}/execute", json=request_data
         )
         return ExecuteToolResponse(**data)
 
@@ -129,19 +127,21 @@ class ToolResource(BaseResource[ClientT]):
         """
         data = self._client._execute_request(  # type: ignore[attr-defined]
             "GET",
-            f"{self._base_path}/definition",
+            f"{self._resource_path}/definition",
             params={"directorId": director_id, "toolId": tool_id},
         )
         return ToolDefinition(**data)
 
-    def authorize(self, tool_name: str, user_id: str) -> AuthResponse:
+    def authorize(
+        self, tool_name: str, user_id: str, tool_version: str | None = None
+    ) -> AuthResponse:
         """
         Get the authorization status for a tool.
         """
         data = self._client._execute_request(  # type: ignore[attr-defined]
             "POST",
-            f"{self._base_path}/authorize",
-            json={"tool_name": tool_name, "user_id": user_id},
+            f"{self._resource_path}/authorize",
+            json={"tool_name": tool_name, "tool_version": tool_version, "user_id": user_id},
         )
         return AuthResponse(**data)
 
@@ -149,6 +149,8 @@ class ToolResource(BaseResource[ClientT]):
 class HealthResource(BaseResource[ClientT]):
     """Health check resource."""
 
+    _path = "/health"
+
     def check(self) -> None:
         """
         Check the health of the Arcade Engine.
@@ -158,7 +160,7 @@ class HealthResource(BaseResource[ClientT]):
         try:
             data = self._client._execute_request(  # type: ignore[attr-defined]
                 "GET",
-                f"/{API_VERSION}/health",
+                f"{self._resource_path}",
                 timeout=5,
             )
 
@@ -184,7 +186,7 @@ class HealthResource(BaseResource[ClientT]):
 class AsyncAuthResource(BaseResource[AsyncArcadeClient]):
     """Asynchronous Authentication resource."""
 
-    _base_path = f"/{API_VERSION}/auth"
+    _path = "/auth"
 
     async def authorize(
         self,
@@ -210,7 +212,7 @@ class AsyncAuthResource(BaseResource[AsyncArcadeClient]):
 
         data = await self._client._execute_request(  # type: ignore[attr-defined]
             "POST",
-            f"{self._base_path}/authorize",
+            f"{self._resource_path}/authorize",
             json=body,
         )
         return AuthResponse(**data)
@@ -236,7 +238,7 @@ class AsyncAuthResource(BaseResource[AsyncArcadeClient]):
 
         data = await self._client._execute_request(  # type: ignore[attr-defined]
             "GET",
-            f"{self._base_path}/status",
+            f"{self._resource_path}/status",
             params={"authorizationId": auth_id, "scopes": " ".join(scopes) if scopes else None},
         )
         return AuthResponse(**data)
@@ -245,7 +247,7 @@ class AsyncAuthResource(BaseResource[AsyncArcadeClient]):
 class AsyncToolResource(BaseResource[AsyncArcadeClient]):
     """Asynchronous Tool resource."""
 
-    _base_path = f"/{API_VERSION}/tools"
+    _path = "/tools"
 
     async def run(
         self,
@@ -264,7 +266,7 @@ class AsyncToolResource(BaseResource[AsyncArcadeClient]):
             "inputs": inputs,
         }
         data = await self._client._execute_request(  # type: ignore[attr-defined]
-            "POST", f"{self._base_path}/execute", json=request_data
+            "POST", f"{self._resource_path}/execute", json=request_data
         )
         return ExecuteToolResponse(**data)
 
@@ -274,19 +276,21 @@ class AsyncToolResource(BaseResource[AsyncArcadeClient]):
         """
         data = await self._client._execute_request(  # type: ignore[attr-defined]
             "GET",
-            f"{self._base_path}/definition",
+            f"{self._resource_path}/definition",
             params={"directorId": director_id, "toolId": tool_id},
         )
         return ToolDefinition(**data)
 
-    async def authorize(self, tool_name: str, user_id: str) -> AuthResponse:
+    async def authorize(
+        self, tool_name: str, user_id: str, tool_version: str | None = None
+    ) -> AuthResponse:
         """
         Get the authorization status for a tool.
         """
         data = await self._client._execute_request(  # type: ignore[attr-defined]
             "POST",
-            f"{self._base_path}/authorize",
-            json={"tool_name": tool_name, "user_id": user_id},
+            f"{self._resource_path}/authorize",
+            json={"tool_name": tool_name, "tool_version": tool_version, "user_id": user_id},
         )
         return AuthResponse(**data)
 
@@ -294,6 +298,8 @@ class AsyncToolResource(BaseResource[AsyncArcadeClient]):
 class AsyncHealthResource(BaseResource[AsyncArcadeClient]):
     """Asynchronous Health check resource."""
 
+    _path = "/health"
+
     async def check(self) -> None:
         """
         Check the health of the Arcade Engine.
@@ -303,7 +309,7 @@ class AsyncHealthResource(BaseResource[AsyncArcadeClient]):
         try:
             data = await self._client._execute_request(  # type: ignore[attr-defined]
                 "GET",
-                f"/{API_VERSION}/health",
+                f"{self._resource_path}",
                 timeout=5,
             )
 
@@ -332,7 +338,7 @@ class Arcade(SyncArcadeClient):
     def __init__(self, *args: Any, **kwargs: Any):
         super().__init__(*args, **kwargs)
         self.auth: AuthResource = AuthResource(self)
-        self.tool: ToolResource = ToolResource(self)
+        self.tools: ToolResource = ToolResource(self)
         self.health: HealthResource = HealthResource(self)
         chat_url = self._chat_url(self._base_url)
         self._openai_client = OpenAI(base_url=chat_url, api_key=self._api_key)
@@ -345,11 +351,8 @@ class Arcade(SyncArcadeClient):
         """
         Execute a synchronous request.
         """
-        try:
-            response = self._request(method, url, **kwargs)
-            return response.json()
-        except httpx.HTTPStatusError as e:
-            self._handle_http_error(e)
+        response = self._request(method, url, **kwargs)
+        return response.json()
 
 
 class AsyncArcade(AsyncArcadeClient):
@@ -358,7 +361,7 @@ class AsyncArcade(AsyncArcadeClient):
     def __init__(self, *args: Any, **kwargs: Any):
         super().__init__(*args, **kwargs)
         self.auth: AsyncAuthResource = AsyncAuthResource(self)
-        self.tool: AsyncToolResource = AsyncToolResource(self)
+        self.tools: AsyncToolResource = AsyncToolResource(self)
         self.health: AsyncHealthResource = AsyncHealthResource(self)
         chat_url = self._chat_url(self._base_url)
         self._openai_client = AsyncOpenAI(base_url=chat_url, api_key=self._api_key)
@@ -371,8 +374,5 @@ class AsyncArcade(AsyncArcadeClient):
         """
         Execute an asynchronous request.
         """
-        try:
-            response = await self._request(method, url, **kwargs)
-            return response.json()
-        except httpx.HTTPStatusError as e:
-            self._handle_http_error(e)
+        response = await self._request(method, url, **kwargs)
+        return response.json()
diff --git a/arcade/arcade/client/schema.py b/arcade/arcade/client/schema.py
index b8a48091..627c44ef 100644
--- a/arcade/arcade/client/schema.py
+++ b/arcade/arcade/client/schema.py
@@ -1,9 +1,12 @@
+import os
 from enum import Enum
 
 from pydantic import AnyUrl, BaseModel, Field
 
 from arcade.core.schema import ToolAuthorizationContext, ToolCallOutput
 
+OPENAI_API_VERSION = os.getenv("OPENAI_API_VERSION", "v1")
+
 
 class AuthProvider(str, Enum):
     """The supported authorization providers."""
diff --git a/arcade/arcade/core/catalog.py b/arcade/arcade/core/catalog.py
index df377ac3..ba9fd566 100644
--- a/arcade/arcade/core/catalog.py
+++ b/arcade/arcade/core/catalog.py
@@ -48,8 +48,6 @@ from arcade.core.utils import (
 from arcade.sdk.annotations import Inferrable
 from arcade.sdk.auth import BaseOAuth2, ToolAuthorization
 
-DEFAULT_TOOLKIT_NAME = "Tools"
-
 InnerWireType = Literal["string", "integer", "number", "boolean", "json"]
 WireType = Union[InnerWireType, Literal["array"]]
 
@@ -116,7 +114,7 @@ class ToolCatalog(BaseModel):
     def add_tool(
         self,
         tool_func: Callable,
-        toolkit_or_name: Union[str | None, Toolkit] = None,
+        toolkit_or_name: Union[str, Toolkit],
         module: ModuleType | None = None,
     ) -> None:
         """
@@ -131,9 +129,6 @@ class ToolCatalog(BaseModel):
         elif isinstance(toolkit_or_name, str):
             toolkit = None
             toolkit_name = toolkit_or_name
-        else:
-            toolkit = None
-            toolkit_name = DEFAULT_TOOLKIT_NAME
 
         if not toolkit_name:
             raise ValueError("A toolkit name or toolkit must be provided.")
@@ -163,6 +158,13 @@ class ToolCatalog(BaseModel):
             output_model=output_model,
         )
 
+    def add_module(self, module: ModuleType) -> None:
+        """
+        Add all the tools in a module to the catalog.
+        """
+        toolkit = Toolkit.from_module(module)
+        self.add_toolkit(toolkit)
+
     def add_toolkit(self, toolkit: Toolkit) -> None:
         """
         Add the tools from a loaded toolkit to the catalog.
@@ -201,6 +203,15 @@ class ToolCatalog(BaseModel):
     def get_tool_names(self) -> list[FullyQualifiedName]:
         return [tool.definition.get_fully_qualified_name() for tool in self._tools.values()]
 
+    def find_tool_by_func(self, func: Callable) -> ToolDefinition:
+        """
+        Find a tool by its function.
+        """
+        for _, tool in self._tools.items():
+            if tool.tool == func:
+                return tool.definition
+        raise ValueError(f"Tool {func} not found in the catalog.")
+
     def get_tool(self, name: FullyQualifiedName) -> MaterializedTool:
         """
         Get a tool from the catalog by fully-qualified name and version.
diff --git a/arcade/arcade/core/config_model.py b/arcade/arcade/core/config_model.py
index 16d749ff..75e558f0 100644
--- a/arcade/arcade/core/config_model.py
+++ b/arcade/arcade/core/config_model.py
@@ -1,16 +1,19 @@
 import ipaddress
+import os
 from pathlib import Path
 from typing import Any
 from urllib.parse import urlparse
 
 import idna
 import toml
-from pydantic import BaseModel, ValidationError
-
-from arcade.core.env import settings
+from pydantic import BaseModel, ConfigDict, ValidationError
 
 
-class ApiConfig(BaseModel):
+class BaseConfig(BaseModel):
+    model_config = ConfigDict(extra="ignore")
+
+
+class ApiConfig(BaseConfig):
     """
     Arcade API configuration.
     """
@@ -19,9 +22,13 @@ class ApiConfig(BaseModel):
     """
     Arcade API key.
     """
+    version: str = "v1"
+    """
+    Arcade API version.
+    """
 
 
-class UserConfig(BaseModel):
+class UserConfig(BaseConfig):
     """
     Arcade user configuration.
     """
@@ -32,7 +39,7 @@ class UserConfig(BaseModel):
     """
 
 
-class EngineConfig(BaseModel):
+class EngineConfig(BaseConfig):
     """
     Arcade Engine configuration.
     """
@@ -51,7 +58,7 @@ class EngineConfig(BaseModel):
     """
 
 
-class Config(BaseModel):
+class Config(BaseConfig):
     """
     Configuration for Arcade.
     """
@@ -79,7 +86,8 @@ class Config(BaseModel):
         """
         Get the path to the Arcade configuration directory.
         """
-        return settings.WORK_DIR if settings.WORK_DIR else Path.home() / ".arcade"
+        config_path = os.getenv("ARCADE_WORK_DIR") or Path.home() / ".arcade"
+        return Path(config_path).resolve()
 
     @classmethod
     def get_config_file_path(cls) -> Path:
@@ -167,14 +175,14 @@ class Config(BaseModel):
         if ":" in parsed_host.netloc and not is_ip:
             host, existing_port = parsed_host.netloc.rsplit(":", 1)
             if existing_port.isdigit():
-                return f"{protocol}://{parsed_host.netloc}/v1"
+                return f"{protocol}://{parsed_host.netloc}/{self.api.version}"
 
         if is_fqdn and self.engine.port is None:
-            return f"{protocol}://{encoded_host}/v1"
+            return f"{protocol}://{encoded_host}/{self.api.version}"
         elif self.engine.port is not None:
-            return f"{protocol}://{encoded_host}:{self.engine.port}/v1"
+            return f"{protocol}://{encoded_host}:{self.engine.port}/{self.api.version}"
         else:
-            return f"{protocol}://{encoded_host}/v1"
+            return f"{protocol}://{encoded_host}/{self.api.version}"
 
     @classmethod
     def ensure_config_dir_exists(cls) -> None:
diff --git a/arcade/arcade/core/env.py b/arcade/arcade/core/env.py
deleted file mode 100644
index 3dcbe063..00000000
--- a/arcade/arcade/core/env.py
+++ /dev/null
@@ -1,20 +0,0 @@
-from functools import lru_cache
-from pathlib import Path
-
-from pydantic_settings import BaseSettings, SettingsConfigDict
-
-
-class Settings(BaseSettings):
-    model_config = SettingsConfigDict(env_file=".env")
-
-    WORK_DIR: Path = Path.home() / ".arcade"
-
-
-@lru_cache
-def get_settings() -> Settings:
-    # env_file = os.getenv("ARCADE_ENV_FILE")
-    # TODO allow env override
-    return Settings()
-
-
-settings = get_settings()
diff --git a/arcade/arcade/core/schema.py b/arcade/arcade/core/schema.py
index fc085926..64455664 100644
--- a/arcade/arcade/core/schema.py
+++ b/arcade/arcade/core/schema.py
@@ -1,9 +1,11 @@
+import os
 from dataclasses import dataclass
 from typing import Any, Literal, Optional, Union
 
 from pydantic import AnyUrl, BaseModel, Field
 
-TOOL_NAME_SEPARATOR = "."
+# Allow overriding the tool name separator via the ARCADE_TOOL_NAME_SEPARATOR env var.
+TOOL_NAME_SEPARATOR = os.getenv("ARCADE_TOOL_NAME_SEPARATOR", ".")
 
 
 class ValueSchema(BaseModel):
diff --git a/arcade/arcade/core/toolkit.py b/arcade/arcade/core/toolkit.py
index f8ba3948..0a6b58ed 100644
--- a/arcade/arcade/core/toolkit.py
+++ b/arcade/arcade/core/toolkit.py
@@ -108,14 +108,19 @@ class Toolkit(BaseModel):
     @classmethod
     def find_all_arcade_toolkits(cls) -> list["Toolkit"]:
         """
-        Find all installed packages prefixed with 'arcade_' and load them as Toolkits.
+        Find all installed packages prefixed with 'arcade_' in the current
+        Python interpreter's environment and load them as Toolkits.
 
         Returns:
             List[Toolkit]: A list of Toolkit instances.
         """
+        import sysconfig
+
+        # Get the site-packages directory of the current interpreter
+        site_packages_dir = sysconfig.get_paths()["purelib"]
         arcade_packages = [
             dist.metadata["Name"]
-            for dist in importlib.metadata.distributions()
+            for dist in importlib.metadata.distributions(path=[site_packages_dir])
             if dist.metadata["Name"].startswith("arcade_")
         ]
         return [cls.from_package(package) for package in arcade_packages]
diff --git a/arcade/arcade/sdk/__init__.py b/arcade/arcade/sdk/__init__.py
index f9ddc08d..26c0fc27 100644
--- a/arcade/arcade/sdk/__init__.py
+++ b/arcade/arcade/sdk/__init__.py
@@ -1,21 +1,5 @@
-from .eval import (
-    BinaryCritic,
-    EvalRubric,
-    EvalSuite,
-    ExpectedToolCall,
-    NumericCritic,
-    SimilarityCritic,
-    tool_eval,
-)
 from .tool import tool
 
 __all__ = [
     "tool",
-    "EvalRubric",
-    "EvalSuite",
-    "ExpectedToolCall",
-    "tool_eval",
-    "BinaryCritic",
-    "SimilarityCritic",
-    "NumericCritic",
 ]
diff --git a/arcade/arcade/sdk/eval/eval.py b/arcade/arcade/sdk/eval/eval.py
index 5ddf0756..167c34c2 100644
--- a/arcade/arcade/sdk/eval/eval.py
+++ b/arcade/arcade/sdk/eval/eval.py
@@ -4,6 +4,7 @@ import json
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Any, Callable
 
+from arcade.core.config_model import Config
 from arcade.core.schema import FullyQualifiedName
 
 try:
@@ -11,9 +12,10 @@ try:
     from scipy.optimize import linear_sum_assignment
 except ImportError:
     raise ImportError(
-        "Use `pip install arcade[evals]` to install the required dependencies for evaluation."
+        "Use `pip install arcade-ai[evals]` to install the required dependencies for evaluation."
     )
 
+
 from arcade.client.client import Arcade, AsyncArcade
 from arcade.sdk.error import WeightError
 
@@ -69,12 +71,15 @@ class EvaluationResult:
         passed: Whether the evaluation passed based on the fail_threshold.
         warning: Whether the evaluation issued a warning based on the warn_threshold.
         results: A list of dictionaries containing the results for each critic.
+        failure_reason: If the evaluation failed completely due to settings in the rubric,
+                        this field contains the reason for failure.
     """
 
     score: float = 0.0
     passed: bool = False
     warning: bool = False
     results: list[dict[str, Any]] = field(default_factory=list)
+    failure_reason: str | None = None
 
     @property
     def fail(self) -> bool:
@@ -120,10 +125,10 @@ class EvaluationResult:
         Returns:
             The score for the tool selection.
         """
-        score = weight if expected == actual else 0.0
+        score = weight if compare_tool_name(expected, actual) else 0.0
         self.add(
             "tool_selection",
-            {"match": expected == actual, "score": score},
+            {"match": compare_tool_name(expected, actual), "score": score},
             weight,
             expected,
             actual,
@@ -190,7 +195,10 @@ class EvalCase:
             True if tool selection failure should occur, False otherwise.
         """
         expected_tools = [tc.name for tc in self.expected_tool_calls]
-        return self.rubric.fail_on_tool_selection and set(expected_tools) != set(actual_tools)
+        return self.rubric.fail_on_tool_selection and not all(
+            compare_tool_name(expected, actual)
+            for expected, actual in zip(expected_tools, actual_tools)
+        )
 
     def check_tool_call_quantity_failure(self, actual_count: int) -> bool:
         """
@@ -218,17 +226,30 @@ class EvalCase:
         evaluation_result = EvaluationResult()
         actual_tools = [tool for tool, _ in actual_tool_calls]
 
-        if self.check_tool_selection_failure(actual_tools):
-            evaluation_result.score = 0.0
-            evaluation_result.passed = False
-            evaluation_result.warning = False
-            return evaluation_result
-
         actual_count = len(actual_tool_calls)
         if self.check_tool_call_quantity_failure(actual_count):
             evaluation_result.score = 0.0
             evaluation_result.passed = False
             evaluation_result.warning = False
+            expected_count = len(self.expected_tool_calls)
+            evaluation_result.failure_reason = (
+                f"Expected {expected_count} tool call(s), but got {actual_count}"
+            )
+            return evaluation_result
+
+        # check if no tools should be called and none were called
+        if not self.expected_tool_calls and not actual_tools:
+            evaluation_result.score = 1.0
+            evaluation_result.passed = True
+            evaluation_result.warning = False
+            return evaluation_result
+
+        if self.check_tool_selection_failure(actual_tools):
+            evaluation_result.score = 0.0
+            evaluation_result.passed = False
+            evaluation_result.warning = False
+            expected_tools = [tc.name for tc in self.expected_tool_calls]
+            evaluation_result.failure_reason = f"Tool selection mismatch. Expected tools: {expected_tools}, but got: {actual_tools}"
             return evaluation_result
 
         # Create a cost matrix for the assignment problem
@@ -422,12 +443,10 @@ class EvalSuite:
     max_concurrent: int = 1  # Default to sequential execution
     _client: AsyncArcade | Arcade | None = None
 
-    def initialize_client(self) -> None:
+    def initialize_client(self, config: Config) -> None:
         """
         Initialize the client instance for the EvalSuite.
         """
-        from arcade.core.config import config
-
         if self.max_concurrent > 1:
             self._client = AsyncArcade(
                 api_key=config.api.key,
@@ -443,7 +462,7 @@ class EvalSuite:
         self,
         name: str,
         user_message: str,
-        expected_tool_calls: list[ExpectedToolCall],
+        expected_tool_calls: list[tuple[Callable, dict[str, Any]]],
         critics: list["Critic"],
         system_message: str | None = None,
         rubric: EvalRubric | None = None,
@@ -461,11 +480,18 @@ class EvalSuite:
             rubric: The evaluation rubric for this case.
             additional_messages: Optional list of additional messages for context.
         """
+        expected = [
+            ExpectedToolCall(
+                name=str(self.catalog.find_tool_by_func(func).get_fully_qualified_name()),
+                args=args,
+            )
+            for func, args in expected_tool_calls
+        ]
         case = EvalCase(
             name=name,
             system_message=system_message or self.system_message,
             user_message=user_message,
-            expected_tool_calls=expected_tool_calls,
+            expected_tool_calls=expected,
             rubric=rubric or self.rubric,
             critics=critics,
             additional_messages=additional_messages or [],
@@ -477,7 +503,7 @@ class EvalSuite:
         name: str,
         user_message: str,
         system_message: str | None = None,
-        expected_tool_calls: list[ExpectedToolCall] | None = None,
+        expected_tool_calls: list[tuple[Callable, dict[str, Any]]] | None = None,
         rubric: EvalRubric | None = None,
         critics: list["Critic"] | None = None,
         additional_messages: list[dict[str, str]] | None = None,
@@ -507,12 +533,22 @@ class EvalSuite:
         if additional_messages:
             new_additional_messages.extend(additional_messages)
 
+        expected = last_case.expected_tool_calls
+        if expected_tool_calls:
+            expected = [
+                ExpectedToolCall(
+                    name=str(self.catalog.find_tool_by_func(func).get_fully_qualified_name()),
+                    args=args,
+                )
+                for func, args in expected_tool_calls
+            ]
+
         # Create a new case, copying from the last one and updating fields
         new_case = EvalCase(
             name=name,
             system_message=system_message or last_case.system_message,
             user_message=user_message,
-            expected_tool_calls=expected_tool_calls or last_case.expected_tool_calls,
+            expected_tool_calls=expected,
             rubric=rubric or self.rubric,
             critics=critics or last_case.critics.copy(),
             additional_messages=new_additional_messages,
@@ -570,7 +606,7 @@ class EvalSuite:
 
         return results
 
-    def run(self, model: str) -> dict[str, Any]:
+    def run(self, config: Config, model: str) -> dict[str, Any]:
         """
         Run the evaluation suite.
 
@@ -581,7 +617,7 @@ class EvalSuite:
             A dictionary containing the evaluation results.
         """
         if not self._client:
-            self.initialize_client()
+            self.initialize_client(config)
 
         if self.max_concurrent > 1:
             # Run asynchronously with concurrency
@@ -614,10 +650,26 @@ def get_tool_args(chat_completion: Any) -> list[tuple[str, dict[str, Any]]]:
     return tool_args_list
 
 
+def compare_tool_name(expected: str, actual: str) -> bool:
+    """
+    Compare the tool name without penalizing for mismatch in separators
+    between module names and tool names, e.g. '-' vs '_' vs '.'
+    """
+    # TODO optimize this
+    # Remove all separators from both names
+    separators = "-_."
+    expected_clean = "".join(char for char in expected if char not in separators)
+    actual_clean = "".join(char for char in actual if char not in separators)
+
+    # Compare the cleaned names
+    return expected_clean == actual_clean
+
+
 def tool_eval() -> Callable[[Callable], Callable]:
     def decorator(func: Callable) -> Callable:
         @functools.wraps(func)
         def wrapper(
+            config: Config,
             models: list[str],
             max_concurrency: int = 1,
         ) -> list[dict[str, Any]]:
@@ -627,7 +679,7 @@ def tool_eval() -> Callable[[Callable], Callable]:
             suite.max_concurrent = max_concurrency
             results = []
             for model in models:
-                result = suite.run(model)
+                result = suite.run(config, model)
                 results.append(result)
             return results
 
diff --git a/arcade/pyproject.toml b/arcade/pyproject.toml
index 58524177..eb1acd1c 100644
--- a/arcade/pyproject.toml
+++ b/arcade/pyproject.toml
@@ -15,14 +15,13 @@ build-backend = "poetry.core.masonry.api"
 [tool.poetry.dependencies]
 python = ">=3.10,<4.0"
 pydantic = "^2.7.0"
-pydantic-settings = "^2.2.1"
 typer = "^0.9.0"
 rich = "^13.7.1"
 toml = "^0.10.2"
 tomlkit = "^0.12.4"
-requests = "^2.26.0" # TODO: is this really needed?
 openai = "^1.36.0" # TODO: relax to an earlier version that still has what we need
 pyjwt = "^2.8.0"
+loguru = "^0.7.0"
 
 
 [tool.poetry.group.fastapi.dependencies]
@@ -115,7 +114,9 @@ ignore = [ # TODO work to remove these
     # raise from (cli specific)
     "B904",
     # long message exceptions
-    "TRY003"
+    "TRY003",
+    # subprocess.Popen
+    "S603",
 ]
 
 [tool.ruff.format]
diff --git a/arcade/tests/client/test_client.py b/arcade/tests/client/test_client.py
index 3b8d9f21..faa7e6af 100644
--- a/arcade/tests/client/test_client.py
+++ b/arcade/tests/client/test_client.py
@@ -68,6 +68,18 @@ HEALTH_CHECK_UNHEALTHY_RESPONSE_DATA = {
 }
 
 
+@pytest.fixture
+def test_sync_client():
+    """Synchronous Arcade client built with a placeholder base URL and API key."""
+    return Arcade(base_url="http://arcade.example.com", api_key="fake_api_key")
+
+
+@pytest.fixture
+def test_async_client():
+    """AsyncArcade client built with a placeholder base URL and API key."""
+    return AsyncArcade(base_url="http://arcade.example.com", api_key="fake_api_key")
+
+
 @pytest.fixture
 def mock_response():
     """Mock Response object for testing."""
@@ -94,7 +106,7 @@ def mock_async_response():
         (500, InternalServerError),
     ],
 )
-def test_handle_http_error(error_code, expected_error, mock_response):
+def test_handle_http_error(test_sync_client, error_code, expected_error, mock_response):
     """Test _handle_http_error method for different error codes."""
     mock_response.status_code = error_code
     mock_response.json.return_value = {"error": "Test error message"}
@@ -103,16 +115,14 @@ def test_handle_http_error(error_code, expected_error, mock_response):
     mock_http_error = Mock(spec=HTTPStatusError)
     mock_http_error.response = mock_response
 
-    client = Arcade(api_key="fake_api_key")  # Create an instance of Arcade
     with pytest.raises(expected_error):
-        client._handle_http_error(mock_http_error)  # Call the method on the instance
+        test_sync_client._handle_http_error(mock_http_error)  # Call the method on the instance
 
 
-def test_arcade_auth_authorize(mock_response, monkeypatch):
+def test_arcade_auth_authorize(test_sync_client, mock_response, monkeypatch):
     """Test Arcade.auth.authorize method."""
     monkeypatch.setattr(Arcade, "_execute_request", lambda *args, **kwargs: AUTH_RESPONSE_DATA)
-    client = Arcade(api_key="fake_api_key")
-    auth_response = client.auth.authorize(
+    auth_response = test_sync_client.auth.authorize(
         provider=AuthProvider.google,
         scopes=["https://www.googleapis.com/auth/gmail.readonly"],
         user_id="sam@arcade-ai.com",
@@ -120,19 +130,17 @@ def test_arcade_auth_authorize(mock_response, monkeypatch):
     assert auth_response == AuthResponse(**AUTH_RESPONSE_DATA)
 
 
-def test_arcade_auth_poll_authorization(mock_response, monkeypatch):
+def test_arcade_auth_poll_authorization(test_sync_client, mock_response, monkeypatch):
     """Test Arcade.auth.poll_authorization method."""
     monkeypatch.setattr(Arcade, "_execute_request", lambda *args, **kwargs: AUTH_RESPONSE_DATA)
-    client = Arcade(api_key="fake_api_key")
-    auth_response = client.auth.status("auth_123")
+    auth_response = test_sync_client.auth.status("auth_123")
     assert auth_response == AuthResponse(**AUTH_RESPONSE_DATA)
 
 
-def test_arcade_tool_run(mock_response, monkeypatch):
-    """Test Arcade.tool.run method."""
+def test_arcade_tool_run(test_sync_client, mock_response, monkeypatch):
+    """Test Arcade.tools.run method."""
     monkeypatch.setattr(Arcade, "_execute_request", lambda *args, **kwargs: TOOL_RESPONSE_DATA)
-    client = Arcade(api_key="fake_api_key")
-    tool_response = client.tool.run(
+    tool_response = test_sync_client.tools.run(
         tool_name="GetEmails",
         user_id="sam@arcade-ai.com",
         tool_version="0.1.0",
@@ -141,54 +149,51 @@ def test_arcade_tool_run(mock_response, monkeypatch):
     assert tool_response == ExecuteToolResponse(**TOOL_RESPONSE_DATA)
 
 
-def test_arcade_tool_get(mock_response, monkeypatch):
-    """Test Arcade.tool.get method."""
+def test_arcade_tool_get(test_sync_client, mock_response, monkeypatch):
+    """Test Arcade.tools.get method."""
     monkeypatch.setattr(Arcade, "_execute_request", lambda *args, **kwargs: TOOL_DEFINITION_DATA)
-    client = Arcade(api_key="fake_api_key")
-    tool_definition = client.tool.get(director_id="default", tool_id="GetEmails")
+    tool_definition = test_sync_client.tools.get(director_id="default", tool_id="GetEmails")
     assert tool_definition == ToolDefinition(**TOOL_DEFINITION_DATA)
 
 
-def test_arcade_tool_authorize(mock_response, monkeypatch):
-    """Test Arcade.tool.authorize method."""
+def test_arcade_tool_authorize(test_sync_client, mock_response, monkeypatch):
+    """Test Arcade.tools.authorize method."""
     monkeypatch.setattr(
         Arcade, "_execute_request", lambda *args, **kwargs: TOOL_AUTHORIZE_RESPONSE_DATA
     )
-    client = Arcade(api_key="fake_api_key")
-    auth_response = client.tool.authorize(tool_name="GetEmails", user_id="sam@arcade-ai.com")
+    auth_response = test_sync_client.tools.authorize(
+        tool_name="GetEmails", user_id="sam@arcade-ai.com"
+    )
     assert auth_response == AuthResponse(**TOOL_AUTHORIZE_RESPONSE_DATA)
 
 
-def test_arcade_health_check(mock_response, monkeypatch):
+def test_arcade_health_check(test_sync_client, mock_response, monkeypatch):
     """Test Arcade.health.check method."""
     monkeypatch.setattr(
         Arcade, "_execute_request", lambda *args, **kwargs: HEALTH_CHECK_HEALTHY_RESPONSE_DATA
     )
-    client = Arcade(api_key="fake_api_key")
-    client.health.check()
+    test_sync_client.health.check()
     assert True  # If no exception is raised, the test passes
 
 
-def test_arcade_health_check_raises_error(mock_response, monkeypatch):
+def test_arcade_health_check_raises_error(test_sync_client, mock_response, monkeypatch):
     """Test Arcade.health.check method."""
     monkeypatch.setattr(
         Arcade, "_execute_request", lambda *args, **kwargs: HEALTH_CHECK_UNHEALTHY_RESPONSE_DATA
     )
-    client = Arcade(api_key="fake_api_key")
     with pytest.raises(EngineNotHealthyError):
-        client.health.check()
+        test_sync_client.health.check()
 
 
 @pytest.mark.asyncio
-async def test_async_arcade_auth_authorize(mock_async_response, monkeypatch):
+async def test_async_arcade_auth_authorize(test_async_client, mock_async_response, monkeypatch):
     """Test AsyncArcade.auth.authorize method."""
 
     async def mock_execute_request(*args, **kwargs):
         return AUTH_RESPONSE_DATA
 
     monkeypatch.setattr(AsyncArcade, "_execute_request", mock_execute_request)
-    client = AsyncArcade(api_key="fake_api_key")
-    auth_response = await client.auth.authorize(
+    auth_response = await test_async_client.auth.authorize(
         provider=AuthProvider.google,
         scopes=["https://www.googleapis.com/auth/gmail.readonly"],
         user_id="sam@arcade-ai.com",
@@ -197,28 +202,28 @@ async def test_async_arcade_auth_authorize(mock_async_response, monkeypatch):
 
 
 @pytest.mark.asyncio
-async def test_async_arcade_auth_poll_authorization(mock_async_response, monkeypatch):
+async def test_async_arcade_auth_poll_authorization(
+    test_async_client, mock_async_response, monkeypatch
+):
     """Test AsyncArcade.auth.poll_authorization method."""
 
     async def mock_execute_request(*args, **kwargs):
         return AUTH_RESPONSE_DATA
 
     monkeypatch.setattr(AsyncArcade, "_execute_request", mock_execute_request)
-    client = AsyncArcade(api_key="fake_api_key")
-    auth_response = await client.auth.status("auth_123")
+    auth_response = await test_async_client.auth.status("auth_123")
     assert auth_response == AuthResponse(**AUTH_RESPONSE_DATA)
 
 
 @pytest.mark.asyncio
-async def test_async_arcade_tool_run(mock_async_response, monkeypatch):
-    """Test AsyncArcade.tool.run method."""
+async def test_async_arcade_tool_run(test_async_client, mock_async_response, monkeypatch):
+    """Test AsyncArcade.tools.run method."""
 
     async def mock_execute_request(*args, **kwargs):
         return TOOL_RESPONSE_DATA
 
     monkeypatch.setattr(AsyncArcade, "_execute_request", mock_execute_request)
-    client = AsyncArcade(api_key="fake_api_key")
-    tool_response = await client.tool.run(
+    tool_response = await test_async_client.tools.run(
         tool_name="GetEmails",
         user_id="sam@arcade-ai.com",
         tool_version="0.1.0",
@@ -228,52 +233,52 @@ async def test_async_arcade_tool_run(mock_async_response, monkeypatch):
 
 
 @pytest.mark.asyncio
-async def test_async_arcade_tool_get(mock_async_response, monkeypatch):
-    """Test AsyncArcade.tool.get method."""
+async def test_async_arcade_tool_get(test_async_client, mock_async_response, monkeypatch):
+    """Test AsyncArcade.tools.get method."""
 
     async def mock_execute_request(*args, **kwargs):
         return TOOL_DEFINITION_DATA
 
     monkeypatch.setattr(AsyncArcade, "_execute_request", mock_execute_request)
-    client = AsyncArcade(api_key="fake_api_key")
-    tool_definition = await client.tool.get(director_id="default", tool_id="GetEmails")
+    tool_definition = await test_async_client.tools.get(director_id="default", tool_id="GetEmails")
     assert tool_definition == ToolDefinition(**TOOL_DEFINITION_DATA)
 
 
 @pytest.mark.asyncio
-async def test_async_arcade_tool_authorize(mock_async_response, monkeypatch):
-    """Test AsyncArcade.tool.authorize method."""
+async def test_async_arcade_tool_authorize(test_async_client, mock_async_response, monkeypatch):
+    """Test AsyncArcade.tools.authorize method."""
 
     async def mock_execute_request(*args, **kwargs):
         return TOOL_AUTHORIZE_RESPONSE_DATA
 
     monkeypatch.setattr(AsyncArcade, "_execute_request", mock_execute_request)
-    client = AsyncArcade(api_key="fake_api_key")
-    auth_response = await client.tool.authorize(tool_name="GetEmails", user_id="sam@arcade-ai.com")
+    auth_response = await test_async_client.tools.authorize(
+        tool_name="GetEmails", user_id="sam@arcade-ai.com"
+    )
     assert auth_response == AuthResponse(**TOOL_AUTHORIZE_RESPONSE_DATA)
 
 
 @pytest.mark.asyncio
-async def test_async_arcade_health_check(mock_async_response, monkeypatch):
+async def test_async_arcade_health_check(test_async_client, mock_async_response, monkeypatch):
     """Test AsyncArcade.health.check method."""
 
     async def mock_execute_request(*args, **kwargs):
         return HEALTH_CHECK_HEALTHY_RESPONSE_DATA
 
     monkeypatch.setattr(AsyncArcade, "_execute_request", mock_execute_request)
-    client = AsyncArcade(api_key="fake_api_key")
-    await client.health.check()
+    await test_async_client.health.check()
     assert True  # If no exception is raised, the test passes
 
 
 @pytest.mark.asyncio
-async def test_async_arcade_health_check_raises_error(mock_async_response, monkeypatch):
+async def test_async_arcade_health_check_raises_error(
+    test_async_client, mock_async_response, monkeypatch
+):
     """Test AsyncArcade.health.check method."""
 
     async def mock_execute_request(*args, **kwargs):
         return HEALTH_CHECK_UNHEALTHY_RESPONSE_DATA
 
     monkeypatch.setattr(AsyncArcade, "_execute_request", mock_execute_request)
-    client = AsyncArcade(api_key="fake_api_key")
     with pytest.raises(EngineNotHealthyError):
-        await client.health.check()
+        await test_async_client.health.check()
diff --git a/arcade/tests/core/test_catalog.py b/arcade/tests/core/test_catalog.py
index 8434bf1c..70288176 100644
--- a/arcade/tests/core/test_catalog.py
+++ b/arcade/tests/core/test_catalog.py
@@ -14,10 +14,10 @@ def sample_tool() -> str:
     return "Hello, world!"
 
 
-def test_add_tool_with_no_toolkit():
+def test_add_tool_with_empty_toolkit_name_raises():
     catalog = ToolCatalog()
-    catalog.add_tool(sample_tool)
-    assert catalog.get_tool(FullyQualifiedName("SampleTool", "Tools", None)).tool == sample_tool
+    with pytest.raises(ValueError):
+        catalog.add_tool(sample_tool, "")
 
 
 def test_add_tool_with_toolkit_name():
diff --git a/docker/Dockerfile b/docker/Dockerfile
index a028cacd..2331d2ac 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -8,7 +8,7 @@ ARG HOST=0.0.0.0
 # Set environment variables using the build arguments
 ENV PORT=${PORT}
 ENV HOST=${HOST}
-ENV WORK_DIR=/app
+ENV ARCADE_WORK_DIR=/app
 
 # Install system dependencies
 RUN apt-get update && apt-get install -y \
@@ -45,8 +45,8 @@ WORKDIR /app/toolkits
 # Install toolkits from the toolkits directory
 RUN set -e; \
     for toolkit in ./*; do \
-        echo "Installing toolkit $toolkit"; \
-        pip install $toolkit; \
+    echo "Installing toolkit $toolkit"; \
+    pip install $toolkit; \
     done
 
 
diff --git a/examples/langchain/gmail.py b/examples/langchain/gmail.py
deleted file mode 100644
index 5d4f4598..00000000
--- a/examples/langchain/gmail.py
+++ /dev/null
@@ -1,68 +0,0 @@
-import os
-
-from google.oauth2.credentials import Credentials
-from langchain_google_community import GmailToolkit
-from langchain_google_community.gmail.utils import (
-    build_resource_service,
-)
-from langchain_openai import ChatOpenAI
-from langgraph.prebuilt import create_react_agent
-
-# Step 1: Install required packages
-# Run the following in your terminal:
-# %pip install -qU langchain-google-community[gmail]
-# %pip install -qU langchain-openai
-# %pip install -qU langgraph
-#
-# Step 2: Set environment variables for LangChain and OpenAI API keys
-# Uncomment the following lines if you have the LangSmith API key
-# os.environ["LANGCHAIN_TRACING_V2"] = "true"
-# os.environ["LANGCHAIN_API_KEY"] = getpass.getpass("Enter your LangSmith API key: ")
-#
-# Step 3 (Option 1) Manually authenticate with Gmail by creating your own google app, credentials, and handling tokens and Oauth
-# credentials = get_gmail_credentials(
-#    token_file="token.json",
-#    scopes=["https://mail.google.com/"],
-#    client_secrets_file="credentials.json",
-# )
-#
-# ----------------- OR -----------------
-# Step 3 (Option 2) Use the Arcade SDK to authenticate with Gmail
-from arcade.client import Arcade, AuthProvider
-
-client = Arcade(api_key=os.environ["ARCADE_API_KEY"])
-
-challenge = client.auth.authorize(
-    provider=AuthProvider.google,
-    scopes=["https://www.googleapis.com/auth/gmail.readonly"],
-    user_id="example_user_id",
-)
-
-if challenge.status != "completed":
-    print(f"Please visit this URL to authorize: {challenge.auth_url}")
-    input("Press Enter after you've completed the authorization...")
-    challenge = client.auth.poll_authorization(challenge)
-    if challenge.status != "completed":
-        print("Authorization not completed. Please try again.")
-        exit(1)
-
-
-creds = Credentials(challenge.context.token)
-api_resource = build_resource_service(credentials=creds)
-toolkit = GmailToolkit(api_resource=api_resource)
-
-# Step 4: Get available tools
-tools = toolkit.get_tools()
-
-# Step 5: Initialize the LLM and create an agent
-llm = ChatOpenAI(model="gpt-4o")
-agent_executor = create_react_agent(llm, tools)
-
-# Step 6: Draft an email using the agent
-example_query = "Read my latest emails to me and summarize them."
-events = agent_executor.stream(
-    {"messages": [("user", example_query)]},
-    stream_mode="values",
-)
-for event in events:
-    event["messages"][-1].pretty_print()
diff --git a/examples/langchain/langgraph_auth.py b/examples/langchain/langgraph_auth.py
new file mode 100644
index 00000000..513e6114
--- /dev/null
+++ b/examples/langchain/langgraph_auth.py
@@ -0,0 +1,60 @@
+import time  # Import time for polling delays
+
+from google.oauth2.credentials import Credentials
+from langchain_google_community import GmailToolkit
+from langchain_google_community.gmail.utils import (
+    build_resource_service,
+)
+from langchain_openai import ChatOpenAI
+from langgraph.prebuilt import create_react_agent
+
+# Step 1: Install required packages
+# Run the following in your terminal:
+# %pip install -qU langchain-google-community[gmail]
+# %pip install -qU langchain-openai
+# %pip install -qU langgraph
+from arcade.client import Arcade, AuthProvider
+
+client = Arcade()
+
+# Start the Google authorization flow, requesting the read-only Gmail scope
+auth_response = client.auth.authorize(
+    provider=AuthProvider.google,
+    scopes=["https://www.googleapis.com/auth/gmail.readonly"],
+    user_id="sam@arcade-ai.com",
+)
+
+# If authorization is not completed, prompt the user and poll for status
+if auth_response.status != "completed":
+    print(
+        "Please complete the authorization challenge in your browser before continuing:"
+    )
+    print(auth_response.auth_url)
+    input("Press Enter to continue...")
+
+    # Poll for authorization status using the auth polling method
+    while auth_response.status != "completed":
+        # Wait before polling again to avoid spamming the server
+        time.sleep(4)
+        auth_response = client.auth.status(auth_response)
+
+# Authorization is completed; proceed with obtaining credentials
+creds = Credentials(auth_response.context.token)
+api_resource = build_resource_service(credentials=creds)
+toolkit = GmailToolkit(api_resource=api_resource)
+
+# Step 4: Get available tools
+tools = toolkit.get_tools()
+
+# Step 5: Initialize the LLM and create an agent
+llm = ChatOpenAI(model="gpt-4o")
+agent_executor = create_react_agent(llm, tools)
+
+# Step 6: Have the agent read and summarize the latest emails
+example_query = "Read my latest emails to me and summarize them."
+events = agent_executor.stream(
+    {"messages": [("user", example_query)]},
+    stream_mode="values",
+)
+for event in events:
+    event["messages"][-1].pretty_print()
diff --git a/examples/langchain/langgraph_with_tool_exec.py b/examples/langchain/langgraph_with_tool_exec.py
new file mode 100644
index 00000000..f98c69cb
--- /dev/null
+++ b/examples/langchain/langgraph_with_tool_exec.py
@@ -0,0 +1,63 @@
+import json
+import os
+from typing import Any, TypedDict
+
+from langgraph.checkpoint.memory import MemorySaver
+from langgraph.errors import NodeInterrupt
+from langgraph.graph import END, START, StateGraph
+
+from arcade.client import Arcade
+
+client = Arcade(api_key=os.environ["ARCADE_API_KEY"])
+
+
+class State(TypedDict):
+    emails: Any
+
+
+def step_1(state: State, config) -> State:
+    user_id = config["configurable"]["user_id"]
+
+    challenge = client.tools.authorize(
+        tool_name="ListEmails",
+        user_id=user_id,
+    )
+
+    if challenge.status != "completed":
+        raise NodeInterrupt(f"Please visit this URL to authorize: {challenge.auth_url}")
+
+    result = client.tools.run(
+        tool_name="ListEmails",
+        user_id=user_id,
+        tool_version="default",
+        inputs=json.dumps({"n_emails": 5}),
+    )
+    return {"emails": result}
+
+
+builder = StateGraph(State)
+builder.add_node("step_1", step_1)
+builder.add_edge(START, "step_1")
+builder.add_edge("step_1", END)
+
+# Set up memory
+memory = MemorySaver()
+
+# Compile the graph with memory
+graph = builder.compile(checkpointer=memory)
+
+config = {"configurable": {"thread_id": "2", "user_id": "sam@arcade-ai.com"}}
+result = graph.invoke({"emails": None}, config=config)
+state = graph.get_state({"configurable": {"thread_id": "2"}})
+print("interrupted state\n----------")
+print(state)
+print("----------")
+input()
+result = graph.invoke({"emails": None}, config=config)
+state = graph.get_state({"configurable": {"thread_id": "2"}})
+print("final state\n----------")
+print(state)
+print("----------")
+print("final result\n----------")
+print(result)
+print("----------")
diff --git a/examples/modal-deploy.py b/examples/modal-deploy.py
index 900fbc98..ba8e014c 100644
--- a/examples/modal-deploy.py
+++ b/examples/modal-deploy.py
@@ -2,7 +2,7 @@ import os
 
 from modal import App, Image, asgi_app
 
-os.environ["WORK_DIR"] = "/root"
+os.environ["ARCADE_WORK_DIR"] = "/root"
 
 # Define the FastAPI app
 app = App("arcade-ai-actor")
diff --git a/toolkits/google/pyproject.toml b/toolkits/google/pyproject.toml
index 9b3ff4b3..cc7888bd 100644
--- a/toolkits/google/pyproject.toml
+++ b/toolkits/google/pyproject.toml
@@ -6,7 +6,7 @@ authors = ["Sam Partee <sam@arcade-ai.com>", "Eric Gustin <eric@arcade-ai.com>"]
 
 [tool.poetry.dependencies]
 python = "^3.10"
-arcade-ai = "*"
+arcade-ai = "^0.1.0"
 google-api-core = "2.19.1"
 google-api-python-client = "2.137.0"
 google-auth = "2.32.0"
@@ -16,7 +16,7 @@ googleapis-common-protos = "1.63.2"
 beautifulsoup4 = "^4.10.0"
 
 [tool.poetry.dev-dependencies]
-pytest = "^7.4.0"
+pytest = "^8.3.0"
 
 [build-system]
 requires = ["poetry-core>=1.0.0"]
diff --git a/toolkits/math/evals/eval_arithmetic_tools.py b/toolkits/math/evals/eval_math_tools.py
similarity index 81%
rename from toolkits/math/evals/eval_arithmetic_tools.py
rename to toolkits/math/evals/eval_math_tools.py
index 1e077bd0..6f60656c 100644
--- a/toolkits/math/evals/eval_arithmetic_tools.py
+++ b/toolkits/math/evals/eval_math_tools.py
@@ -1,12 +1,11 @@
-from arcade.core.catalog import ToolCatalog
-from arcade.core.toolkit import Toolkit
 import arcade_math
+from arcade_math.tools.arithmetic import add, sqrt
 
+from arcade.core.catalog import ToolCatalog
 from arcade.sdk.eval import (
     BinaryCritic,
     EvalRubric,
     EvalSuite,
-    ExpectedToolCall,
     tool_eval,
 )
 
@@ -18,11 +17,11 @@ rubric = EvalRubric(
 
 
 catalog = ToolCatalog()
-catalog.add_toolkit(Toolkit.from_module(arcade_math))
+catalog.add_module(arcade_math)
 
 
 @tool_eval()
-def arithmetic_eval_suite():
+def math_eval_suite():
     suite = EvalSuite(
         name="Math Tools Evaluation",
         system_message="You are an AI assistant with access to math tools. Use them to help the user with their math-related tasks.",
@@ -34,9 +33,9 @@ def arithmetic_eval_suite():
         name="Add two large numbers",
         user_message="Add 12345 and 987654321",
         expected_tool_calls=[
-            ExpectedToolCall(
-                "Arithmetic_Add",
-                args={
+            (
+                add,
+                {
                     "a": 12345,
                     "b": 987654321,
                 },
@@ -55,7 +54,12 @@ def arithmetic_eval_suite():
         name="Take the square root of a large number",
         user_message="What is the square root of 3224990521?",
         expected_tool_calls=[
-            ExpectedToolCall("Arithmetic_Sqrt", args={"a": 3224990521})
+            (
+                sqrt,
+                {
+                    "a": 3224990521,
+                },
+            )
         ],
         rubric=rubric,
         critics=[
diff --git a/toolkits/math/pyproject.toml b/toolkits/math/pyproject.toml
index 0e42e230..47fb3e97 100644
--- a/toolkits/math/pyproject.toml
+++ b/toolkits/math/pyproject.toml
@@ -7,10 +7,10 @@ authors = ["Nate <nate@arcade-ai.com>"]
 
 [tool.poetry.dependencies]
 python = "^3.10"
-arcade-ai = "*"
+arcade-ai = "^0.1.0"
 
 [tool.poetry.dev-dependencies]
-pytest = "^7.4"
+pytest = "^8.3.0"
 
 [build-system]
 requires = ["poetry-core>=1.0.0"]
diff --git a/toolkits/search/evals/eval_google_search.py b/toolkits/search/evals/eval_google_search.py
new file mode 100644
index 00000000..a776e98b
--- /dev/null
+++ b/toolkits/search/evals/eval_google_search.py
@@ -0,0 +1,239 @@
+import arcade_search
+from arcade_search.tools.google import search_google
+
+from arcade.core.catalog import ToolCatalog
+from arcade.sdk.eval import (
+    EvalRubric,
+    EvalSuite,
+    NumericCritic,
+    SimilarityCritic,
+    tool_eval,
+)
+
+# Evaluation rubric
+rubric = EvalRubric(
+    fail_threshold=0.8,
+    warn_threshold=0.9,
+)
+
+catalog = ToolCatalog()
+# Register the Google Search tool
+catalog.add_module(arcade_search)
+
+
+@tool_eval()
+def google_search_eval_suite() -> EvalSuite:
+    """Create an evaluation suite for the Google Search tool."""
+    suite = EvalSuite(
+        name="Google Search Tool Evaluation",
+        system_message="You are an AI assistant that can perform web searches using the provided tools.",
+        catalog=catalog,
+        rubric=rubric,
+    )
+
+    # Simple search query with default results
+    suite.add_case(
+        name="Simple search query with default results",
+        user_message="Search for 'Climate change effects on polar bears' on Google.",
+        expected_tool_calls=[
+            (
+                search_google,
+                {
+                    "query": "Climate change effects on polar bears",
+                    "n_results": 5,
+                },
+            )
+        ],
+        critics=[
+            SimilarityCritic(critic_field="query", weight=1.0),
+        ],
+    )
+
+    # Search query with specific number of results
+    suite.add_case(
+        name="Search query with specific number of results",
+        user_message="Find the top 3 articles about quantum computing.",
+        expected_tool_calls=[
+            (
+                search_google,
+                {
+                    "query": "articles about quantum computing",
+                    "n_results": 3,
+                },
+            )
+        ],
+        critics=[
+            SimilarityCritic(critic_field="query", weight=0.7),
+            NumericCritic(
+                critic_field="n_results",
+                weight=0.3,
+                value_range=(1, 100),
+            ),
+        ],
+    )
+
+    # Search query with 'n' results specified in words
+    suite.add_case(
+        name="Search query with 'n' results specified in words",
+        user_message="Give me five recipes for vegan lasagna.",
+        expected_tool_calls=[
+            (
+                search_google,
+                {
+                    "query": "recipes for vegan lasagna",
+                    "n_results": 5,
+                },
+            )
+        ],
+        critics=[
+            SimilarityCritic(critic_field="query", weight=0.7),
+            NumericCritic(
+                critic_field="n_results",
+                weight=0.3,
+                value_range=(1, 100),
+            ),
+        ],
+    )
+
+    # Ambiguous number of results
+    suite.add_case(
+        name="Ambiguous number of results",
+        user_message="Find articles about climate change impacts 10.",
+        expected_tool_calls=[
+            (
+                search_google,
+                {
+                    "query": "articles about climate change impacts 10",
+                    "n_results": 5,
+                },
+            )
+        ],
+        critics=[
+            SimilarityCritic(critic_field="query", weight=1.0),
+        ],
+    )
+
+    # Search query with multiple instructions
+    suite.add_case(
+        name="Search query with multiple instructions",
+        user_message="Search for the latest news on electric cars, and tell me about Tesla's new model.",
+        expected_tool_calls=[
+            (
+                search_google,
+                {
+                    "query": "latest news on electric cars",
+                    "n_results": 5,
+                },
+            ),
+            (
+                search_google,
+                {
+                    "query": "Tesla's new model",
+                    "n_results": 5,
+                },
+            ),
+        ],
+        critics=[
+            SimilarityCritic(critic_field="query", weight=1.0),
+        ],
+    )
+
+    # Search with stop words and filler words
+    suite.add_case(
+        name="Search with stop words and filler words",
+        user_message="Could you please search for the best ways to learn French?",
+        expected_tool_calls=[
+            (
+                search_google,
+                {
+                    "query": "best ways to learn French",
+                    "n_results": 5,
+                },
+            )
+        ],
+        critics=[
+            SimilarityCritic(critic_field="query", weight=1.0),
+        ],
+    )
+
+    # No clear query given
+    suite.add_case(
+        name="No clear query given",
+        user_message="Find it for me.",
+        expected_tool_calls=[],
+        critics=[],
+    )
+
+    # Search query with special characters
+    suite.add_case(
+        name="Search query with special characters",
+        user_message="Find me '@OpenAI's latest research papers'",
+        expected_tool_calls=[
+            (
+                search_google,
+                {
+                    "query": "@OpenAI's latest research papers",
+                    "n_results": 5,
+                },
+            )
+        ],
+        critics=[
+            SimilarityCritic(critic_field="query", weight=1.0),
+        ],
+    )
+
+    # Search query with complex instructions
+    suite.add_case(
+        name="Search query with complex instructions",
+        user_message="I need information about the impact of deforestation in the Amazon over the past decade.",
+        expected_tool_calls=[
+            (
+                search_google,
+                {
+                    "query": "impact of deforestation in the Amazon over the past decade",
+                    "n_results": 5,
+                },
+            )
+        ],
+        critics=[
+            SimilarityCritic(critic_field="query", weight=1.0),
+        ],
+    )
+
+    # Search query in a different language
+    suite.add_case(
+        name="Search query in a different language",
+        user_message="Busca información sobre la economía de España.",
+        expected_tool_calls=[
+            (
+                search_google,
+                {
+                    "query": "economía de España",
+                    "n_results": 5,
+                },
+            )
+        ],
+        critics=[
+            SimilarityCritic(critic_field="query", weight=1.0),
+        ],
+    )
+
+    # Search query with numeric data
+    suite.add_case(
+        name="Search query with numeric data",
+        user_message="What was the population of Japan in 2020?",
+        expected_tool_calls=[
+            (
+                search_google,
+                {
+                    "query": "population of Japan in 2020",
+                    "n_results": 5,
+                },
+            )
+        ],
+        critics=[
+            SimilarityCritic(critic_field="query", weight=1.0),
+        ],
+    )
+
+    return suite
diff --git a/toolkits/search/pyproject.toml b/toolkits/search/pyproject.toml
index 319e0812..8f34835b 100644
--- a/toolkits/search/pyproject.toml
+++ b/toolkits/search/pyproject.toml
@@ -6,11 +6,11 @@ authors = ["Sam Partee <sam@arcade-ai.com>"]
 
 [tool.poetry.dependencies]
 python = "^3.10"
-arcade-ai = "*"
+arcade-ai = "^0.1.0"
 serpapi = "^0.1.5"
 
 [tool.poetry.dev-dependencies]
-pytest = "^7.4.0"
+pytest = "^8.3.0"
 
 [build-system]
 requires = ["poetry-core>=1.0.0"]
diff --git a/toolkits/slack/evals/eval_slack_messaging.py b/toolkits/slack/evals/eval_slack_messaging.py
index 3600b2c8..cbe0b0e2 100644
--- a/toolkits/slack/evals/eval_slack_messaging.py
+++ b/toolkits/slack/evals/eval_slack_messaging.py
@@ -1,3 +1,4 @@
+import arcade_slack
 from arcade_slack.tools.chat import send_dm_to_user, send_message_to_channel
 
 from arcade.core.catalog import ToolCatalog
@@ -5,7 +6,6 @@ from arcade.sdk.eval import (
     BinaryCritic,
     EvalRubric,
     EvalSuite,
-    ExpectedToolCall,
     SimilarityCritic,
     tool_eval,
 )
@@ -19,8 +19,7 @@ rubric = EvalRubric(
 
 catalog = ToolCatalog()
 # Register the Slack tools
-catalog.add_tool(send_dm_to_user)
-catalog.add_tool(send_message_to_channel)
+catalog.add_module(arcade_slack)
 
 
 @tool_eval()
@@ -38,9 +37,9 @@ def slack_eval_suite() -> EvalSuite:
         name="Send DM to user with clear username",
         user_message="Send a direct message to johndoe saying 'Hello, can we meet at 3 PM?'",
         expected_tool_calls=[
-            ExpectedToolCall(
-                name="SendDmToUser",
-                args={
+            (
+                send_dm_to_user,
+                {
                     "user_name": "johndoe",
                     "message": "Hello, can we meet at 3 PM?",
                 },
@@ -56,54 +55,54 @@ def slack_eval_suite() -> EvalSuite:
         name="Send DM with ambiguous username",
         user_message="Message John about the project deadline",
         expected_tool_calls=[
-            ExpectedToolCall(
-                name="SendDmToUser",
-                args={
+            (
+                send_dm_to_user,
+                {
                     "user_name": "john",
                     "message": "Hi John, I wanted to check about the project deadline. Can you provide an update?",
                 },
             )
         ],
-        critics=[
-            SimilarityCritic(critic_field="user_name", weight=0.6),
-            SimilarityCritic(critic_field="message", weight=0.4),
-        ],
-    )
-
-    suite.add_case(
-        name="Send DM with username in different format",
-        user_message="DM Jane.Doe to reschedule our meeting",
-        expected_tool_calls=[
-            ExpectedToolCall(
-                name="SendDmToUser",
-                args={
-                    "user_name": "jane.doe",
-                    "message": "Hi Jane, I need to reschedule our meeting. When are you available?",
-                },
-            )
-        ],
         critics=[
             BinaryCritic(critic_field="user_name", weight=0.6),
             SimilarityCritic(critic_field="message", weight=0.4),
         ],
     )
 
+    suite.add_case(
+        name="Send DM with username in different format",
+        user_message="DM Jane.Doe to reschedule our meeting",
+        expected_tool_calls=[
+            (
+                send_dm_to_user,
+                {
+                    "user_name": "jane.doe",
+                    "message": "Hi Jane, I need to reschedule our meeting. When are you available?",
+                },
+            )
+        ],
+        critics=[
+            BinaryCritic(critic_field="user_name", weight=0.5),
+            SimilarityCritic(critic_field="message", weight=0.5),
+        ],
+    )
+
     # Send Message to Channel Scenarios
     suite.add_case(
         name="Send message to channel with clear name",
         user_message="Post 'The new feature is now live!' in the #announcements channel",
         expected_tool_calls=[
-            ExpectedToolCall(
-                name="SendMessageToChannel",
-                args={
+            (
+                send_message_to_channel,
+                {
                     "channel_name": "announcements",
                     "message": "The new feature is now live!",
                 },
             )
         ],
         critics=[
-            BinaryCritic(critic_field="channel_name", weight=0.6),
-            SimilarityCritic(critic_field="message", weight=0.4),
+            BinaryCritic(critic_field="channel_name", weight=0.5),
+            SimilarityCritic(critic_field="message", weight=0.5),
         ],
     )
 
@@ -111,9 +110,9 @@ def slack_eval_suite() -> EvalSuite:
         name="Send message to channel with ambiguous name",
         user_message="Inform the engineering team about the upcoming maintenance in the general channel",
         expected_tool_calls=[
-            ExpectedToolCall(
-                name="SendMessageToChannel",
-                args={
+            (
+                send_message_to_channel,
+                {
                     "channel_name": "engineering",
                     "message": "Attention team: There will be upcoming maintenance. Please save your work and expect some downtime.",
                 },
@@ -130,9 +129,9 @@ def slack_eval_suite() -> EvalSuite:
         name="Ambiguous between DM and channel message",
         user_message="Send 'Great job on the presentation!' to the team",
         expected_tool_calls=[
-            ExpectedToolCall(
-                name="SendMessageToChannel",
-                args={
+            (
+                send_message_to_channel,
+                {
                     "channel_name": "general",
                     "message": "Great job on the presentation!",
                 },
@@ -149,25 +148,25 @@ def slack_eval_suite() -> EvalSuite:
         name="Multiple recipients in DM request",
         user_message="Send a DM to Alice and Bob about pushing the meeting tomorrow. I have to much work to do.",
         expected_tool_calls=[
-            ExpectedToolCall(
-                name="SendDmToUser",
-                args={
+            (
+                send_dm_to_user,
+                {
                     "user_name": "alice",
                     "message": "Hi Alice, about our meeting tomorrow, let's reschedule? I am swamped with work.",
                 },
             ),
-            ExpectedToolCall(
-                name="SendDmToUser",
-                args={
+            (
+                send_dm_to_user,
+                {
                     "user_name": "bob",
                     "message": "Hi Bob, about our meeting tomorrow, let's reschedule? I am swamped with work.",
                 },
             ),
         ],
         critics=[
-            SimilarityCritic(critic_field="user_name", weight=0.6),
+            SimilarityCritic(critic_field="user_name", weight=0.7),
             SimilarityCritic(
-                critic_field="message", weight=0.4, similarity_threshold=0.7
+                critic_field="message", weight=0.3, similarity_threshold=0.6
             ),
         ],
     )
@@ -176,9 +175,9 @@ def slack_eval_suite() -> EvalSuite:
         name="Channel name similar to username",
         user_message="Post 'sounds great!' in john-project channel",
         expected_tool_calls=[
-            ExpectedToolCall(
-                name="SendMessageToChannel",
-                args={
+            (
+                send_message_to_channel,
+                {
                     "channel_name": "john-project",
                     "message": "Sounds great!",
                 },
diff --git a/toolkits/slack/pyproject.toml b/toolkits/slack/pyproject.toml
index 8efae88d..c35eaf73 100644
--- a/toolkits/slack/pyproject.toml
+++ b/toolkits/slack/pyproject.toml
@@ -10,7 +10,7 @@ arcade-ai = "^0.1.0"
 slack-sdk = "^3.31.0"
 
 [tool.poetry.dev-dependencies]
-pytest = "^7.4.0"
+pytest = "^8.3.0"
 
 [build-system]
 requires = ["poetry-core>=1.0.0"]
diff --git a/toolkits/x/evals/eval_x_tools.py b/toolkits/x/evals/eval_x_tools.py
index 6ba16c45..8fb48815 100644
--- a/toolkits/x/evals/eval_x_tools.py
+++ b/toolkits/x/evals/eval_x_tools.py
@@ -1,17 +1,16 @@
+import arcade_x
+from arcade_x.tools.tweets import post_tweet
+
+# TODO: restore these imports once eval cases reference them.
+# From arcade_x.tools.tweets: delete_tweet_by_id,
+#   search_recent_tweets_by_keywords,
+#   search_recent_tweets_by_username.
+# From arcade_x.tools.users: lookup_single_user_by_username.
 from arcade.core.catalog import ToolCatalog
-from arcade_x.tools.tweets import (
-    post_tweet,
-    delete_tweet_by_id,
-    # search_recent_tweets_by_query,
-    search_recent_tweets_by_username,
-    search_recent_tweets_by_keywords,
-)
-from arcade_x.tools.users import lookup_single_user_by_username
 from arcade.sdk.eval import (
-    BinaryCritic,
     EvalRubric,
     EvalSuite,
-    ExpectedToolCall,
+    SimilarityCritic,
     tool_eval,
 )
 
@@ -22,11 +21,8 @@ rubric = EvalRubric(
 )
 
 catalog = ToolCatalog()
-catalog.add_tool(search_recent_tweets_by_keywords)
-catalog.add_tool(lookup_single_user_by_username)
-catalog.add_tool(post_tweet)
-catalog.add_tool(delete_tweet_by_id)
-catalog.add_tool(search_recent_tweets_by_username)
+# Register the X tools
+catalog.add_module(arcade_x)
 
 
 @tool_eval()
@@ -45,17 +41,18 @@ def x_eval_suite() -> EvalSuite:
         name="Post a tweet",
         user_message="Send out a tweet that says 'Hello World! Exciting stuff is happening over at Arcade AI!'",
         expected_tool_calls=[
-            ExpectedToolCall(
-                name="PostTweet",
-                args={
+            (
+                post_tweet,
+                {
                     "tweet_text": "Hello World! Exciting stuff is happening over at Arcade AI!"
                 },
             )
         ],
         critics=[
-            BinaryCritic(
+            SimilarityCritic(
                 critic_field="tweet_text",
                 weight=1.0,
+                similarity_threshold=0.9,
             ),
         ],
     )