From a4160dd9fe15d78e152f0688f1a90cc2ee55c320 Mon Sep 17 00:00:00 2001
From: Eric Gustin <34000337+EricGustin@users.noreply.github.com>
Date: Thu, 29 Jan 2026 15:12:06 -0800
Subject: [PATCH] Four bug fixes (#754)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1. Resolves
[TOO-363](https://linear.app/arcadedev/issue/TOO-363/arcade-deploy-fails-when-additional-deps-are-added-to-the-server).
2. Resolves
[TOO-364](https://linear.app/arcadedev/issue/TOO-364/arcade-cores-tool-skip-logic-is-missing-case-for-direct-execution).
3. Resolves
[TOO-358](https://linear.app/arcadedev/issue/TOO-358/missing-evals-error-message-shows-wrong-command).
4. Resolves
[TOO-365](https://linear.app/arcadedev/issue/TOO-365/arcade-evals-unit-tests-are-hanging).

<!-- CURSOR_SUMMARY -->
---

> [!NOTE]
> **Medium Risk**
> Medium risk because it changes how `arcade deploy` spawns the server
process and adjusts toolkit discovery skip logic, which can affect
deployments and tool discovery; however, the changes are small and
covered by new unit/integration tests.
>
> **Overview**
> `arcade deploy` now starts the validation server using the project’s
`.venv` interpreter (via `find_python_interpreter`) instead of the CLI’s
own `sys.executable`, preventing missing dependency failures when the
CLI is installed in an isolated env.
>
> `arcade-core`’s `Toolkit.tools_from_directory` skip logic is hardened
to also skip the currently executing entrypoint by module name
(`__main__.__spec__.name`) when file paths don’t match (e.g., bundled
execution). CLI error printing now escapes plain messages to avoid rich
markup issues, and `arcade-evals` lock acquisition accepts an optional
timeout default.
>
> Adds unit tests for the new toolkit skip behavior and an integration
test that boots the MCP server via direct Python invocation to mirror
deployment behavior, and bumps `arcade-core`, `arcade-mcp-server`, and
root dependency versions accordingly.
>
> <sup>Written by [Cursor
Bugbot](https://cursor.com/dashboard?tab=bugbot) for commit
e7785634c231c059f2e0bd1bc73a56bd7470a494. This will update automatically
on new commits. Configure
[here](https://cursor.com/dashboard?tab=bugbot).</sup>
<!-- /CURSOR_SUMMARY -->
---
 libs/arcade-cli/arcade_cli/configure.py       |  11 +-
 libs/arcade-cli/arcade_cli/deploy.py          |  10 +-
 libs/arcade-cli/arcade_cli/utils.py           |   2 +-
 libs/arcade-core/arcade_core/toolkit.py       |  46 ++++--
 libs/arcade-core/pyproject.toml               |   2 +-
 libs/arcade-evals/arcade_evals/loaders.py     |   6 +-
 libs/arcade-mcp-server/pyproject.toml         |   4 +-
 .../integration/test_end_to_end.py            | 102 ++++++++++++
 libs/tests/core/test_toolkit.py               | 151 ++++++++++++++++++
 pyproject.toml                                |   6 +-
 10 files changed, 313 insertions(+), 27 deletions(-)
diff --git a/libs/arcade-cli/arcade_cli/configure.py b/libs/arcade-cli/arcade_cli/configure.py
index 50ea47c9..a6599850 100644
--- a/libs/arcade-cli/arcade_cli/configure.py
+++ b/libs/arcade-cli/arcade_cli/configure.py
@@ -146,7 +146,16 @@ def get_tool_secrets() -> dict:
 
 
 def find_python_interpreter() -> Path:
-    # Find the Python interpreter in the virtual environment
+    """
+    Find the Python interpreter in the virtual environment.
+
+    NOTE: This function assumes it is called from the project root directory (where .venv lives).
+    Currently, callers like `arcade deploy` enforce this by requiring pyproject.toml in cwd.
+    If this requirement is relaxed in the future, this function should be updated to:
+      1. Accept a project_root parameter, OR
+      2. Honor VIRTUAL_ENV / UV_PROJECT_ENVIRONMENT env vars, OR
+      3. Search upward from cwd to find pyproject.toml and resolve .venv relative to that
+    """
     venv_python = None
     # Check for .venv first (uv default)
     if (Path.cwd() / ".venv").exists():
diff --git a/libs/arcade-cli/arcade_cli/deploy.py b/libs/arcade-cli/arcade_cli/deploy.py
index 82a64ff0..7b8308d5 100644
--- a/libs/arcade-cli/arcade_cli/deploy.py
+++ b/libs/arcade-cli/arcade_cli/deploy.py
@@ -4,7 +4,6 @@ import io
 import os
 import random
 import subprocess
-import sys
 import tarfile
 import time
 from collections import deque
@@ -22,6 +21,7 @@ from rich.spinner import Spinner
 from rich.text import Text
 from typing_extensions import Literal
 
+from arcade_cli.configure import find_python_interpreter
 from arcade_cli.secret import load_env_file
 from arcade_cli.utils import (
     compute_base_url,
@@ -388,7 +388,13 @@ def start_server_process(entrypoint: str, debug: bool = False) -> tuple[subproce
         "ARCADE_WORKER_SECRET": "temp-validation-secret",
     }
 
-    cmd = [sys.executable, entrypoint]
+    # Use the project's Python environment, not the CLI's isolated environment.
+    # find_python_interpreter() looks for .venv/bin/python in cwd, falling back to sys.executable.
+    # This ensures the server runs in the project's environment even when the CLI is installed
+    # in an isolated environment (e.g., via 'uv tool install arcade-mcp').
+    project_python = find_python_interpreter()
+    cmd = [str(project_python), entrypoint]
+
     process = subprocess.Popen(
         cmd,
         stdout=subprocess.PIPE,
diff --git a/libs/arcade-cli/arcade_cli/utils.py b/libs/arcade-cli/arcade_cli/utils.py
index c5818069..47914fc1 100644
--- a/libs/arcade-cli/arcade_cli/utils.py
+++ b/libs/arcade-cli/arcade_cli/utils.py
@@ -394,7 +394,7 @@ def handle_cli_error(
     elif error:
         console.print(f"❌ {message}: {escape(str(error))}", style="bold red")
     else:
-        console.print(f"❌ {message}", style="bold red")
+        console.print(f"❌ {escape(message)}", style="bold red")
 
     if should_exit:
         raise CLIError(message, error)
diff --git a/libs/arcade-core/arcade_core/toolkit.py b/libs/arcade-core/arcade_core/toolkit.py
index 6460a0a3..426f84e1 100644
--- a/libs/arcade-core/arcade_core/toolkit.py
+++ b/libs/arcade-core/arcade_core/toolkit.py
@@ -299,32 +299,50 @@ class Toolkit(BaseModel):
         # Skipping this file is necessary because tools are discovered via AST parsing, but those tools
         # aren't in the module's namespace yet since the file is still executing.
         current_file = None
+        current_module_name = None
         main_module = sys.modules.get("__main__")
-        if main_module and hasattr(main_module, "__file__") and main_module.__file__:
-            with contextlib.suppress(Exception):
-                current_file = Path(main_module.__file__).resolve()
+        if main_module:
+            if hasattr(main_module, "__file__") and main_module.__file__:
+                with contextlib.suppress(Exception):
+                    current_file = Path(main_module.__file__).resolve()
+            # Get module name from __spec__ if available (used when paths don't match,
+            # e.g., script runs from bundle but package is in site-packages)
+            main_spec = getattr(main_module, "__spec__", None)
+            if main_spec and main_spec.name:
+                current_module_name = main_spec.name
 
         tools: dict[str, list[str]] = {}
 
         for module_path in modules:
-            # Skip adding tools from the currently executing file
-            if current_file:
-                try:
-                    module_path_resolved = module_path.resolve()
-                    if module_path_resolved == current_file:
-                        continue
-                except Exception:  # noqa: S110
-                    pass
-
+            # Build import path first (needed for module name comparison in skip logic)
             relative_path = module_path.relative_to(package_dir)
-            cls.validate_file(module_path)
-            # Build import path and avoid duplicating the package prefix if it already exists
             relative_parts = relative_path.with_suffix("").parts
             import_path = ".".join(relative_parts)
             if relative_parts and relative_parts[0] == package_name:
                 full_import_path = import_path
             else:
                 full_import_path = f"{package_name}.{import_path}" if import_path else package_name
+
+            # Skip logic: check by file path OR by module name
+            # This handles cases where the script is run from a different location than
+            # where the package is installed (e.g., deployment scenarios)
+            should_skip = False
+            if current_file:
+                try:
+                    module_path_resolved = module_path.resolve()
+                    if module_path_resolved == current_file:
+                        should_skip = True
+                except Exception:  # noqa: S110
+                    pass
+
+            # Secondary check: compare module names when paths don't match
+            if not should_skip and current_module_name and full_import_path == current_module_name:
+                should_skip = True
+
+            if should_skip:
+                continue
+
+            cls.validate_file(module_path)
             tools[full_import_path] = get_tools_from_file(str(module_path))
 
         if not tools:
diff --git a/libs/arcade-core/pyproject.toml b/libs/arcade-core/pyproject.toml
index b98736cb..867ea6e8 100644
--- a/libs/arcade-core/pyproject.toml
+++ b/libs/arcade-core/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "arcade-core"
-version = "4.2.1"
+version = "4.2.2"
 description = "Arcade Core - Core library for Arcade platform"
 readme = "README.md"
 license = { text = "MIT" }
diff --git a/libs/arcade-evals/arcade_evals/loaders.py b/libs/arcade-evals/arcade_evals/loaders.py
index 882023d3..58150fb0 100644
--- a/libs/arcade-evals/arcade_evals/loaders.py
+++ b/libs/arcade-evals/arcade_evals/loaders.py
@@ -98,10 +98,10 @@ async def _get_cache_lock(cache_key: str) -> asyncio.Lock:
     return _cache_locks[cache_key]
 
 
-async def _acquire_lock_with_timeout(
-    lock: asyncio.Lock, timeout: float = LOCK_TIMEOUT_SECONDS
-) -> bool:
+async def _acquire_lock_with_timeout(lock: asyncio.Lock, timeout: float | None = None) -> bool:
     """Acquire a lock with timeout. Returns True if acquired, False on timeout."""
+    if timeout is None:
+        timeout = LOCK_TIMEOUT_SECONDS
     try:
         await asyncio.wait_for(lock.acquire(), timeout=timeout)
     except asyncio.TimeoutError:
diff --git a/libs/arcade-mcp-server/pyproject.toml b/libs/arcade-mcp-server/pyproject.toml
index 67be5f96..854158b3 100644
--- a/libs/arcade-mcp-server/pyproject.toml
+++ b/libs/arcade-mcp-server/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "arcade-mcp-server"
-version = "1.15.0"
+version = "1.15.1"
 description = "Model Context Protocol (MCP) server framework for Arcade.dev"
 readme = "README.md"
 authors = [{ name = "Arcade.dev" }]
@@ -21,7 +21,7 @@ classifiers = [
 ]
 requires-python = ">=3.10"
 dependencies = [
-    "arcade-core>=4.2.1,<5.0.0",
+    "arcade-core>=4.2.2,<5.0.0",
     "arcade-serve>=3.2.0,<4.0.0",
     "arcade-tdk>=3.4.0,<4.0.0",
     "arcadepy>=1.5.0",
diff --git a/libs/tests/arcade_mcp_server/integration/test_end_to_end.py b/libs/tests/arcade_mcp_server/integration/test_end_to_end.py
index d18e21fa..be461c88 100644
--- a/libs/tests/arcade_mcp_server/integration/test_end_to_end.py
+++ b/libs/tests/arcade_mcp_server/integration/test_end_to_end.py
@@ -82,6 +82,51 @@ def start_mcp_server(
         raise ValueError(f"Invalid transport: {transport}")
 
 
+def start_mcp_server_direct_python(
+    transport: str, port: int | None = None
+) -> tuple[subprocess.Popen, int | None]:
+    """
+    Start MCP server with direct Python invocation (simulates what happens in the Engine during deployment).
+
+    Args:
+        transport: Transport type ("stdio" or "http")
+        port: Port for HTTP transport (optional, will be random if not provided)
+
+    Returns:
+        Tuple of (process, port). Port is None for stdio transport.
+    """
+    entrypoint_path = get_entrypoint_path()
+    package_path = Path(__file__).parent / "server"
+
+    # Find Python in the server's venv
+    venv_python = package_path / ".venv" / "bin" / "python"
+    if not venv_python.exists():
+        pytest.skip("Server venv not found - run 'uv sync' in integration/server first")
+
+    if port is None:
+        port = random.randint(8000, 9000)  # noqa: S311
+
+    env = {
+        **os.environ,
+        "ARCADE_SERVER_HOST": "127.0.0.1",
+        "ARCADE_SERVER_PORT": str(port),
+        "ARCADE_SERVER_TRANSPORT": transport,
+        "ARCADE_AUTH_DISABLED": "true",
+        "ARCADE_WORKER_SECRET": "test-secret-direct",
+    }
+
+    cmd = [str(venv_python), entrypoint_path, transport]
+    process = subprocess.Popen(
+        cmd,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        text=True,
+        env=env,
+        cwd=str(package_path),
+    )
+    return process, port if transport == "http" else None
+
+
 def wait_for_http_server_ready(port: int, timeout: int = 30) -> None:
     """
     Wait for HTTP server to become healthy.
@@ -913,3 +958,60 @@ async def test_http_mixed_route_concurrent_execution():
         except subprocess.TimeoutExpired:
             process.kill()
             process.wait()
+
+
+@pytest.mark.asyncio
+async def test_http_direct_python_invocation():
+    """Test server starts correctly with direct Python (simulates what happens in the Engine during deployment)"""
+    process, port = start_mcp_server_direct_python("http")
+    assert port is not None
+
+    try:
+        wait_for_http_server_ready(port, timeout=10)
+
+        # Verify server is healthy and tools are discoverable
+        headers = {
+            "Content-Type": "application/json",
+            "Accept": "application/json",
+        }
+        client = httpx.Client(base_url=f"http://127.0.0.1:{port}", timeout=10.0, headers=headers)
+
+        # Initialize
+        init_response = client.post(
+            "/mcp",
+            json=build_jsonrpc_request(
+                "initialize",
+                {
+                    "protocolVersion": "2025-06-18",
+                    "capabilities": {},
+                    "clientInfo": {"name": "test", "version": "1.0"},
+                },
+                request_id=1,
+            ),
+        )
+        assert init_response.status_code == 200
+
+        session_id = init_response.headers.get("mcp-session-id")
+        client.headers.update({"Mcp-Session-Id": session_id})
+
+        # Send initialized notification
+        init_notif = build_jsonrpc_request(
+            "notifications/initialized", params=None, request_id=None
+        )
+        client.post("/mcp", json=init_notif)
+
+        # List tools - should have 9 tools (including hello_world from entrypoint.py)
+        list_response = client.post("/mcp", json=build_jsonrpc_request("tools/list", request_id=2))
+        assert list_response.status_code == 200
+        tools = list_response.json()["result"]["tools"]
+        assert len(tools) == 9
+
+        client.close()
+
+    finally:
+        process.terminate()
+        try:
+            process.wait(timeout=5)
+        except subprocess.TimeoutExpired:
+            process.kill()
+            process.wait()
diff --git a/libs/tests/core/test_toolkit.py b/libs/tests/core/test_toolkit.py
index 8cea12c5..3c362791 100644
--- a/libs/tests/core/test_toolkit.py
+++ b/libs/tests/core/test_toolkit.py
@@ -500,3 +500,154 @@ class TestValidPath:
     def test_invalid_paths(self, path_input):
         """Test that invalid paths are rejected."""
         assert Validate.path(path_input) is False
+
+
+class TestToolsFromDirectory:
+    """Test the tools_from_directory skip logic for entrypoint files.
+
+    It has two conditions:
+    1. File path match: module_path.resolve() == current_file
+    2. Module name match: full_import_path == current_module_name
+    """
+
+    @pytest.fixture
+    def temp_package(self, tmp_path):
+        """Create a temporary package with Python files containing tools."""
+        package_dir = tmp_path / "mypackage"
+        package_dir.mkdir()
+
+        (package_dir / "__init__.py").write_text("")
+        (package_dir / "entrypoint.py").write_text(
+            '''
+from arcade_mcp_server import tool
+
+@tool
+def my_tool():
+    """A tool."""
+    pass
+'''
+        )
+        tools_dir = package_dir / "tools"
+        tools_dir.mkdir()
+        (tools_dir / "__init__.py").write_text("")
+        (tools_dir / "helper.py").write_text(
+            '''
+from arcade_mcp_server import tool
+
+@tool
+def helper_tool():
+    """A helper tool."""
+    pass
+'''
+        )
+
+        return package_dir
+
+    @patch("arcade_core.toolkit.get_tools_from_file")
+    def test_skips_file_when_path_matches(self, mock_get_tools, temp_package):
+        """Verify skip when __main__.__file__ matches module path (original behavior)."""
+        mock_get_tools.return_value = ["some_tool"]
+        entrypoint_path = temp_package / "entrypoint.py"
+
+        # Create mock __main__ module with matching file path
+        mock_main = MagicMock()
+        mock_main.__file__ = str(entrypoint_path)
+        mock_main.__spec__ = None
+
+        with patch.dict("sys.modules", {"__main__": mock_main}):
+            result = Toolkit.tools_from_directory(temp_package, "mypackage")
+
+        # entrypoint.py should be skipped, only tools/helper.py should be included
+        assert "mypackage.entrypoint" not in result
+        assert "mypackage.tools.helper" in result
+
+    @patch("arcade_core.toolkit.get_tools_from_file")
+    def test_skips_file_when_module_name_matches(self, mock_get_tools, temp_package):
+        """Verify skip when __main__.__spec__.name matches even if paths differ.
+
+        This simulates deployment scenarios where the script runs from a bundle
+        but the package is installed in site-packages (different paths).
+        """
+        mock_get_tools.return_value = ["some_tool"]
+
+        # Create mock __main__ module with different file path but matching module name
+        mock_main = MagicMock()
+        mock_main.__file__ = "/some/other/path/entrypoint.py"  # Different path
+        mock_spec = MagicMock()
+        mock_spec.name = "mypackage.entrypoint"  # but, matches the module being scanned
+        mock_main.__spec__ = mock_spec
+
+        with patch.dict("sys.modules", {"__main__": mock_main}):
+            result = Toolkit.tools_from_directory(temp_package, "mypackage")
+
+        # entrypoint.py should be skipped due to module name match
+        assert "mypackage.entrypoint" not in result
+        assert "mypackage.tools.helper" in result
+
+    @patch("arcade_core.toolkit.get_tools_from_file")
+    def test_no_skip_when_different_module(self, mock_get_tools, temp_package):
+        """Verify unrelated modules are not skipped."""
+        mock_get_tools.return_value = ["some_tool"]
+
+        # Create mock __main__ module with completely different identity
+        mock_main = MagicMock()
+        mock_main.__file__ = "/some/other/path/other_script.py"
+        mock_spec = MagicMock()
+        mock_spec.name = "some_other_package.script"
+        mock_main.__spec__ = mock_spec
+
+        with patch.dict("sys.modules", {"__main__": mock_main}):
+            result = Toolkit.tools_from_directory(temp_package, "mypackage")
+
+        # Both files should be included since neither matches __main__
+        assert "mypackage.entrypoint" in result
+        assert "mypackage.tools.helper" in result
+
+    @patch("arcade_core.toolkit.get_tools_from_file")
+    def test_no_skip_when_no_main_module(self, mock_get_tools, temp_package):
+        """Handle case where __main__ is not in sys.modules."""
+        mock_get_tools.return_value = ["some_tool"]
+
+        # Remove __main__ from sys.modules
+        with patch.dict("sys.modules", {"__main__": None}):
+            result = Toolkit.tools_from_directory(temp_package, "mypackage")
+
+        # Both files should be included
+        assert "mypackage.entrypoint" in result
+        assert "mypackage.tools.helper" in result
+
+    @patch("arcade_core.toolkit.get_tools_from_file")
+    def test_no_skip_when_no_spec(self, mock_get_tools, temp_package):
+        """Handle case where __main__ has no __spec__ attribute."""
+        mock_get_tools.return_value = ["some_tool"]
+
+        # Create mock __main__ module with different file path and no __spec__
+        mock_main = MagicMock()
+        mock_main.__file__ = "/some/other/path/script.py"
+        mock_main.__spec__ = None
+
+        with patch.dict("sys.modules", {"__main__": mock_main}):
+            result = Toolkit.tools_from_directory(temp_package, "mypackage")
+
+        # Both files should be included since path doesn't match and no spec
+        assert "mypackage.entrypoint" in result
+        assert "mypackage.tools.helper" in result
+
+    @patch("arcade_core.toolkit.get_tools_from_file")
+    def test_no_skip_when_spec_has_no_name(self, mock_get_tools, temp_package):
+        """Handle case where __main__.__spec__ exists but has no name."""
+        mock_get_tools.return_value = ["some_tool"]
+
+        # Create mock __main__ module with __spec__ but no name
+        mock_main = MagicMock()
+        mock_main.__file__ = "/some/other/path/script.py"
+        mock_spec = MagicMock()
+        mock_spec.name = None
+        mock_main.__spec__ = mock_spec
+
+        with patch.dict("sys.modules", {"__main__": mock_main}):
+            result = Toolkit.tools_from_directory(temp_package, "mypackage")
+
+        # Both files should be included
+        assert "mypackage.entrypoint" in result
+        assert "mypackage.tools.helper" in result
diff --git a/pyproject.toml b/pyproject.toml
index a89e33dd..af9fc6e9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "arcade-mcp"
-version = "1.8.2"
+version = "1.8.3"
 description = "Arcade.dev - Tool Calling platform for Agents"
 readme = "README.md"
 license = { file = "LICENSE" }
@@ -19,8 +19,8 @@ requires-python = ">=3.10"
 
 dependencies = [
     # CLI dependencies
-    "arcade-mcp-server>=1.14.2,<2.0.0",
-    "arcade-core>=4.2.1,<5.0.0",
+    "arcade-mcp-server>=1.15.1,<2.0.0",
+    "arcade-core>=4.2.2,<5.0.0",
     "typer==0.10.0",
     "rich>=14.0.0,<15.0.0",
     "Jinja2==3.1.6",