<!-- CURSOR_SUMMARY -->
> [!NOTE]
> **Medium Risk**
> Touches multiple toolkits’ runtime entrypoints and context/error/auth plumbing, so breakage risk is mainly around invocation/packaging and tool execution wiring rather than business logic.
>
> **Overview**
> Migrates the BrightData, ClickHouse, LinkedIn, Math, MongoDB, Postgres, and Zendesk OSS toolkits from `arcade-tdk` to `arcade-mcp-server` APIs by updating tool decorators, `Context` types, auth classes, and exception imports.
>
> Adds per-toolkit `__main__.py` files that construct an `MCPApp`, register module tools, and run via configurable transport/host/port; corresponding `pyproject.toml` updates bump versions, drop `arcade-tdk`/`arcade-serve` deps, and add `project.scripts` console entrypoints.
>
> Updates tests and eval suites to use `arcade_mcp_server.Context` (mocked) and switches eval `ToolCatalog` imports to `arcade_core`.
>
> <sup>Written by [Cursor Bugbot](https://cursor.com/dashboard?tab=bugbot) for commit 9b3e31acb4b35e1d72efd47e2d279c5b19e3ecb0. This will update automatically on new commits. Configure [here](https://cursor.com/dashboard?tab=bugbot).</sup>
<!-- /CURSOR_SUMMARY -->
190 lines
5.9 KiB
Python
190 lines
5.9 KiB
Python
# RUN ME WITH `uv run arcade evals evals --host api.arcade.dev`
|
|
|
|
import arcade_mongodb
|
|
from arcade_core import ToolCatalog
|
|
from arcade_evals import (
|
|
BinaryCritic,
|
|
EvalRubric,
|
|
EvalSuite,
|
|
ExpectedToolCall,
|
|
SimilarityCritic,
|
|
tool_eval,
|
|
)
|
|
from arcade_mongodb.tools.mongodb import (
|
|
aggregate_documents,
|
|
count_documents,
|
|
discover_collections,
|
|
discover_databases,
|
|
find_documents,
|
|
get_collection_schema,
|
|
)
|
|
|
|
# Score thresholds passed to the suite and to every case below.
rubric = EvalRubric(fail_threshold=0.85, warn_threshold=0.95)

# Tool catalog populated from the arcade_mongodb package.
catalog = ToolCatalog()
catalog.add_module(arcade_mongodb)
|
|
|
|
|
|
@tool_eval()
def mongodb_eval_suite() -> EvalSuite:
    """Build the evaluation suite for the MongoDB tools.

    Each case pairs a user prompt with the single tool call the model is
    expected to make, plus critics that weight the individual arguments of
    that call. Cases are registered in a fixed order: database discovery,
    collection discovery, schema lookup, find, count, filtered count, and
    aggregation.

    Returns:
        The populated ``EvalSuite`` named "MongoDB Tools Evaluation".
    """

    def direct_call_hint(tool_name: str) -> list:
        # Extra user turn telling the model to skip the discovery step.
        hint = (
            f"You can call {tool_name} directly without discovering collections first for this test."
        )
        return [{"role": "user", "content": hint}]

    def location_critics(weight: float) -> list:
        # Fresh exact-match critics for the database/collection pair.
        return [
            BinaryCritic(critic_field="database_name", weight=weight),
            BinaryCritic(critic_field="collection_name", weight=weight),
        ]

    # Collection targeted by the find/count/aggregate cases; copied per case
    # so no two expected calls share the same dict object.
    startup_log = {"database_name": "local", "collection_name": "startup_log"}

    suite = EvalSuite(
        name="MongoDB Tools Evaluation",
        system_message=(
            "You are an AI assistant with access to MongoDB tools. "
            "Use them to help the user with their tasks."
        ),
        catalog=catalog,
        rubric=rubric,
    )

    # --- Discovery -------------------------------------------------------
    suite.add_case(
        name="Discover databases",
        user_message="What databases are available in my MongoDB instance?",
        expected_tool_calls=[ExpectedToolCall(func=discover_databases, args={})],
        rubric=rubric,
    )

    suite.add_case(
        name="Discover collections",
        user_message="What collections are in the 'admin' database?",
        expected_tool_calls=[
            ExpectedToolCall(func=discover_collections, args={"database_name": "admin"}),
        ],
        rubric=rubric,
        critics=[BinaryCritic(critic_field="database_name", weight=1.0)],
    )

    suite.add_case(
        name="Get collection schema (single tool call)",
        user_message="Get the schema of the 'system.users' collection in the 'admin' database.",
        expected_tool_calls=[
            ExpectedToolCall(
                func=get_collection_schema,
                args={"database_name": "admin", "collection_name": "system.users"},
            ),
        ],
        rubric=rubric,
        critics=location_critics(0.5),
    )

    # --- Querying --------------------------------------------------------
    suite.add_case(
        name="Find documents (direct call)",
        user_message="Find documents in the 'startup_log' collection of the 'local' database, limited to 5 results.",
        additional_messages=direct_call_hint("find_documents"),
        expected_tool_calls=[
            ExpectedToolCall(func=find_documents, args={**startup_log, "limit": 5}),
        ],
        rubric=rubric,
        critics=[
            *location_critics(0.33),
            BinaryCritic(critic_field="limit", weight=0.34),
        ],
    )

    suite.add_case(
        name="Count documents",
        user_message="Count all documents in the 'startup_log' collection of the 'local' database.",
        additional_messages=direct_call_hint("count_documents"),
        expected_tool_calls=[
            ExpectedToolCall(func=count_documents, args={**startup_log}),
        ],
        rubric=rubric,
        critics=location_critics(0.5),
    )

    suite.add_case(
        name="Count documents with filter",
        user_message="Count documents in the 'startup_log' collection of the 'local' database where the level is 'INFO'.",
        additional_messages=direct_call_hint("count_documents"),
        expected_tool_calls=[
            ExpectedToolCall(
                func=count_documents,
                args={**startup_log, "filter_dict": '{"level": "INFO"}'},
            ),
        ],
        rubric=rubric,
        critics=[
            *location_critics(0.25),
            SimilarityCritic(critic_field="filter_dict", weight=0.5),
        ],
    )

    # --- Aggregation -----------------------------------------------------
    group_by_level_stage = '{"$group": {"_id": "$level", "count": {"$sum": 1}}}'
    suite.add_case(
        name="Aggregate documents",
        user_message="Group documents in the 'startup_log' collection of the 'local' database by level and count them.",
        additional_messages=direct_call_hint("aggregate_documents"),
        expected_tool_calls=[
            ExpectedToolCall(
                func=aggregate_documents,
                args={**startup_log, "pipeline": [group_by_level_stage]},
            ),
        ],
        rubric=rubric,
        critics=[
            *location_critics(0.2),
            SimilarityCritic(critic_field="pipeline", weight=0.6),
        ],
    )

    return suite
|