arcade-mcp/toolkits/github/evals/eval_github_activity.py
Eric Gustin 7c228a59d5
Update Evals SDK (#175)
# PR Description
This PR renames `ExpectedToolCall` to `NamedExpectedToolCall` and then
creates a new dataclass called `ExpectedToolCall`. `ExpectedToolCall`
can be passed to the `EvalSuite.add_case` and `EvalSuite.extend_case`
methods.

1. Enhanced `EvalSuite.add_case` and `EvalSuite.extend_case` to accept
a list of `ExpectedToolCall` as their `expected_tool_calls` input
parameter. This gives developers a clearer scaffold to fill in. Previously,
the expected type was `list[tuple[Callable, dict[str, Any]]]`, which is
still valid for backward compatibility.
```python
# Before (still valid for backward compatibility)
expected_tool_calls=[
    (
        adjust_playback_position,
        {
            "absolute_position_ms": 10000,
        },
    )
]
        

# After
expected_tool_calls=[
    ExpectedToolCall(
        func=adjust_playback_position,
        args={"absolute_position_ms": 10000},
    )
]
```
2. Removed any references to arcade.core in toolkits directory.
3. Some linting for import organization.
2024-12-19 10:29:13 -08:00

115 lines
3.3 KiB
Python

import arcade_github
from arcade_github.tools.activity import list_stargazers, set_starred
from arcade.sdk import ToolCatalog
from arcade.sdk.eval import (
BinaryCritic,
EvalRubric,
EvalSuite,
ExpectedToolCall,
tool_eval,
)
# Tool catalog: register the GitHub tools from the arcade_github package
catalog = ToolCatalog()
catalog.add_module(arcade_github)

# Evaluation rubric: fail/warn thresholds handed to the suite below
rubric = EvalRubric(
    warn_threshold=0.95,
    fail_threshold=0.9,
)
def _binary_critics(**weights: float) -> list[BinaryCritic]:
    """Build one ``BinaryCritic`` per keyword argument, in the order given.

    Args:
        **weights: Maps each critic field (the name of an expected tool-call
            argument) to the weight passed to its ``BinaryCritic``.

    Returns:
        A new list of freshly constructed critics, so no critic instance is
        shared between evaluation cases.
    """
    return [
        BinaryCritic(critic_field=field, weight=weight)
        for field, weight in weights.items()
    ]


@tool_eval()
def github_activity_eval_suite() -> EvalSuite:
    """Build the evaluation suite for the GitHub Activity tools.

    The suite contains four cases: starring a repository, unstarring a
    repository, listing a fixed number of stargazers, and listing all
    stargazers. Each case pairs a user message with the single tool call
    that is expected, and scores every expected argument with a
    ``BinaryCritic`` (the weights in each case sum to 1.0).

    Returns:
        The populated ``EvalSuite``, using the module-level ``catalog`` and
        ``rubric``.
    """
    suite = EvalSuite(
        name="GitHub Activity Tools Evaluation Suite",
        system_message=(
            "You are an AI assistant that helps users interact with GitHub "
            "repositories using the provided tools."
        ),
        catalog=catalog,
        rubric=rubric,
    )

    # Set Starred
    suite.add_case(
        name="Star a repository",
        user_message="Star the test repository that is owned by ArcadeAI.",
        expected_tool_calls=[
            ExpectedToolCall(
                func=set_starred,
                args={
                    "owner": "ArcadeAI",
                    "name": "test",
                    "starred": True,
                },
            )
        ],
        critics=_binary_critics(owner=0.3, name=0.3, starred=0.4),
    )

    suite.add_case(
        name="Unstar a repository",
        user_message="Unstar the ArcadeAI/test repository.",
        expected_tool_calls=[
            ExpectedToolCall(
                func=set_starred,
                args={
                    "owner": "ArcadeAI",
                    "name": "test",
                    "starred": False,
                },
            )
        ],
        critics=_binary_critics(owner=0.3, name=0.3, starred=0.4),
    )

    # List Stargazers
    suite.add_case(
        name="List stargazers for a repository",
        user_message="List 42 stargazers for the ArcadeAI/arcade-ai repository.",
        expected_tool_calls=[
            ExpectedToolCall(
                func=list_stargazers,
                args={
                    "owner": "ArcadeAI",
                    "repo": "arcade-ai",
                    "limit": 42,
                },
            )
        ],
        critics=_binary_critics(owner=0.3, repo=0.3, limit=0.4),
    )

    # This case previously reused the name of the case above; it gets its own
    # name so the two can be told apart wherever cases are reported by name.
    suite.add_case(
        name="List all stargazers for a repository",
        user_message="List all of the stargazers for the ArcadeAI/arcade-ai repo",
        expected_tool_calls=[
            ExpectedToolCall(
                func=list_stargazers,
                args={
                    "owner": "ArcadeAI",
                    "repo": "arcade-ai",
                    # No explicit limit is expected when the user asks for all.
                    "limit": None,
                },
            )
        ],
        critics=_binary_critics(owner=0.3, repo=0.3, limit=0.4),
    )

    return suite