arcade-mcp/examples/evals/eval_http_mcp_server.py

"""Remote HTTP/SSE MCP server evaluation.

This example demonstrates loading and evaluating tools from remote MCP servers
accessible via HTTP or Server-Sent Events (SSE).

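NOTE: This requires running MCP servers reachable over HTTP and/or SSE. Update
the configuration below, or set the MCP_SERVER_URL, GITHUB_PAT, and SSE_MCP_URL
environment variables, with your server details.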

Run:
    arcade evals examples/evals/eval_http_mcp_server.py \\
        -p openai:gpt-4o \\
        -k openai:YOUR_KEY \\
        -o results.html -d
"""

import asyncio
import os

from arcade_evals import (
    BinaryCritic,
    EvalRubric,
    EvalSuite,
    ExpectedMCPToolCall,
    tool_eval,
)

# =============================================================================
# CONFIGURATION - Update these for your HTTP MCP server
# =============================================================================

# HTTP endpoint and bearer token. The defaults point at the GitHub Copilot MCP
# endpoint (which requires a GitHub token); override them with the
# MCP_SERVER_URL and GITHUB_PAT environment variables.
HTTP_MCP_URL = os.getenv("MCP_SERVER_URL", "https://api.githubcopilot.com/mcp/")
HTTP_MCP_TOKEN = os.getenv("GITHUB_PAT", "YOUR_GITHUB_TOKEN_HERE")

# SSE endpoint. The default is only an example URL; override it with the
# SSE_MCP_URL environment variable.
SSE_MCP_URL = os.getenv("SSE_MCP_URL", "https://mcp.example.com/sse")

# Rubric passed to both eval suites defined in this file.
default_rubric = EvalRubric(fail_threshold=0.7, warn_threshold=0.9)


# =============================================================================
# EVAL SUITE - HTTP MCP Server
# =============================================================================


@tool_eval()
async def eval_http_mcp_server() -> EvalSuite:
    """Build the eval suite for tools exposed by the HTTP MCP server.

    The server at ``HTTP_MCP_URL`` is registered on the suite with a bearer
    ``Authorization`` header built from ``HTTP_MCP_TOKEN`` and with
    ``use_sse=False``. Registration is limited to 15 seconds.

    Returns:
        The suite with one example case (an ``echo`` tool call) when the
        server was registered, or the suite without any cases when
        registration timed out or raised; the failure is printed, not raised.
    """
    http_suite = EvalSuite(
        name="HTTP MCP Server Evaluation",
        system_message="You are a helpful assistant with access to remote tools.",
        rubric=default_rubric,
    )

    print("\n  Loading HTTP MCP server...")

    auth_headers = {"Authorization": f"Bearer {HTTP_MCP_TOKEN}"}
    try:
        registration = http_suite.add_mcp_server(
            url=HTTP_MCP_URL,
            headers=auth_headers,
            use_sse=False,  # Use HTTP streaming
        )
        await asyncio.wait_for(registration, timeout=15.0)
        print("  ✓ HTTP MCP server")
    except Exception as exc:
        # Any failure is reported and the case-less suite is handed back so
        # the eval run can continue; only the message differs for timeouts.
        if isinstance(exc, asyncio.TimeoutError):
            print("  ✗ HTTP MCP server - timeout")
        else:
            print(f"  ✗ HTTP MCP server - {type(exc).__name__}: {exc}")
        return http_suite

    # Example case: replace the tool name and arguments with ones that your
    # own server actually exposes.
    expected_echo = ExpectedMCPToolCall(
        tool_name="echo",  # Adjust to match your server's tool names
        args={"message": "Hello from HTTP"},
    )
    message_critic = BinaryCritic(critic_field="message", weight=1.0)
    http_suite.add_case(
        name="HTTP server tool call",
        user_message="Echo 'Hello from HTTP'",
        expected_tool_calls=[expected_echo],
        critics=[message_critic],
    )

    return http_suite


# =============================================================================
# EVAL SUITE - SSE MCP Server
# =============================================================================


@tool_eval()
async def eval_sse_mcp_server() -> EvalSuite:
    """Build the eval suite for tools exposed by the SSE MCP server.

    The server at ``SSE_MCP_URL`` is registered on the suite with
    ``use_sse=True`` and an ``Accept: text/event-stream`` header.
    Registration is limited to 15 seconds.

    Returns:
        The suite with one example case (a ``get_status`` tool call with no
        arguments and no critics) when the server was registered, or the
        suite without any cases when registration timed out or raised; the
        failure is printed, not raised.
    """
    sse_suite = EvalSuite(
        name="SSE MCP Server Evaluation",
        system_message="You are a helpful assistant with access to SSE-connected tools.",
        rubric=default_rubric,
    )

    print("\n  Loading SSE MCP server...")

    stream_headers = {"Accept": "text/event-stream"}
    try:
        registration = sse_suite.add_mcp_server(
            url=SSE_MCP_URL,
            use_sse=True,  # Use SSE transport
            headers=stream_headers,
        )
        await asyncio.wait_for(registration, timeout=15.0)
        print("  ✓ SSE MCP server")
    except Exception as exc:
        # Any failure is reported and the case-less suite is handed back so
        # the eval run can continue; only the message differs for timeouts.
        if isinstance(exc, asyncio.TimeoutError):
            print("  ✗ SSE MCP server - timeout")
        else:
            print(f"  ✗ SSE MCP server - {type(exc).__name__}: {exc}")
        return sse_suite

    # Example case: replace the tool name with one your SSE server exposes.
    expected_status = ExpectedMCPToolCall(
        tool_name="get_status",  # Adjust to match your server's tools
        args={},
    )
    sse_suite.add_case(
        name="SSE server tool call",
        user_message="Get status",
        expected_tool_calls=[expected_status],
        critics=[],
    )

    return sse_suite