arcade-mcp/toolkits/github/evals/eval_github_pull_requests.py
Sam Partee b6b4cd0a4c
🏗️ Restructure: Multi-Package Architecture + uv Migration (#412)
### Overview
Major restructuring from monolithic `arcade-ai` package to modular
library architecture with standardized uv-based dependency management.

![arcade-ai Monorepo (2)](https://github.com/user-attachments/assets/25f102b0-bb87-4a04-9701-d227d05664b1)

### New Package Structure
- **`arcade-tdk`** - Lightweight toolkit development kit (core
decorators, auth)
- **`arcade-core`** - Core execution engine and catalog functionality  
- **`arcade-serve`** - FastAPI/MCP server components
- **`arcade-ai`** - Meta package that includes CLI functionality.
Optionally include evals via the `evals` extra. Optionally include all
packages via the `all` extra.

### Key Benefits
- **Lighter Dependencies**: Toolkits now depend only on `arcade-tdk` (~2
deps) vs full `arcade-ai` (~30+ deps)
- **Faster Builds**: uv provides 10-100x faster dependency resolution
and installation
- **Better Modularity**: Clear separation of concerns, consumers import
only what they need
- **Standard Tooling**: Eliminates custom poetry scripts, uses standard
Python packaging

### Migration Impact
- All 20 toolkits converted from poetry → uv with `arcade-tdk`
dependencies plus `arcade-ai[evals]` and `arcade-serve` dev
dependencies. When developing locally, devs should install toolkits via
`make install-local`.
- Modern Python 3.10+ type hints throughout
- Standardized build system with hatchling backend
- Enhanced Makefile with robust toolkit management commands
- Removed `arcade dev` CLI command
- Reduced the number of files created by `arcade new` and added an option
to skip generating the tests and evals folders.

This foundation enables faster development cycles and cleaner dependency
chains for the growing toolkit ecosystem.

### Todo After this PR is merged
- [ ] Post-merge workflow(s) (release & publish containers, etc)
- [ ] Release order plan. @EricGustin suggests releasing in the
following order:
    1. `arcade-core` version 0.1.0
    2. `arcade-serve` version 0.1.0 and `arcade-tdk` version 0.1.0
    3. `arcade-ai` version 2.0.0
    4. Patch release for all toolkits (all changes in toolkits are internal
       refactors)
- [ ] [Update docs](https://github.com/ArcadeAI/docs/pull/318)

---------

Co-authored-by: Eric Gustin <eric@arcade.dev>
Co-authored-by: Eric Gustin <34000337+EricGustin@users.noreply.github.com>
2025-06-11 16:48:17 -07:00

250 lines
8.9 KiB
Python

from arcade_evals import (
BinaryCritic,
EvalRubric,
EvalSuite,
ExpectedToolCall,
SimilarityCritic,
tool_eval,
)
from arcade_tdk import ToolCatalog
import arcade_github
from arcade_github.tools.models import (
DiffSide,
ReviewCommentSubjectType,
SortDirection,
)
from arcade_github.tools.pull_requests import (
create_reply_for_review_comment,
create_review_comment,
get_pull_request,
list_pull_request_commits,
list_pull_requests,
list_review_comments_on_pull_request,
update_pull_request,
)
# Scoring rubric passed to the eval suite defined in this module.
# NOTE(review): presumably a score under 0.9 fails a case and a score under
# 0.95 only warns -- confirm against the arcade_evals EvalRubric documentation.
rubric = EvalRubric(fail_threshold=0.9, warn_threshold=0.95)

# Tool catalog passed to the eval suite; the arcade_github package is
# registered as a module on it.
catalog = ToolCatalog()
catalog.add_module(arcade_github)
@tool_eval()
def github_pull_requests_eval_suite() -> EvalSuite:
    """Assemble the evaluation suite for the GitHub pull request tools.

    Every case pairs a natural-language user message with the single tool call
    that is expected in response, plus the weighted critics applied to the
    arguments of that call. Cases are registered in the order they appear below.

    Returns:
        EvalSuite: The suite built on the module-level ``catalog`` and
        ``rubric`` with all pull request cases added.
    """
    suite = EvalSuite(
        name="GitHub Pull Requests Tools Evaluation Suite",
        system_message="You are an AI assistant that helps users interact with GitHub pull requests using the provided tools.",
        catalog=catalog,
        rubric=rubric,
    )

    def binary(weights: dict[str, float]) -> list[BinaryCritic]:
        """Create one BinaryCritic per ``field: weight`` entry, keeping dict order."""
        return [
            BinaryCritic(critic_field=field, weight=weight)
            for field, weight in weights.items()
        ]

    def register(*, name, user_message, func, args, critics) -> None:
        """Add a case that expects exactly one call to ``func`` with ``args``."""
        suite.add_case(
            name=name,
            user_message=user_message,
            expected_tool_calls=[ExpectedToolCall(func=func, args=args)],
            critics=critics,
        )

    # Arguments shared by the cases; each case spreads them into a fresh dict so
    # no two expected tool calls share the same mapping object.
    repo_args = {"owner": "ArcadeAI", "repo": "test"}
    pr_args = {**repo_args, "pull_number": 72}

    # List Pull Requests
    register(
        name="List all open pull requests",
        user_message="List all open pull requests in the test repository under the ArcadeAI account that are proposing to merge into main.",
        func=list_pull_requests,
        args={**repo_args, "state": "open", "base": "main"},
        critics=binary({"owner": 0.2, "repo": 0.2, "state": 0.2, "base": 0.1}),
    )

    # Get Pull Request
    register(
        name="Get details of a pull request",
        user_message="Get diff of pull request #72 in the 'ArcadeAI/test' repository. Include all the data that is available in your response.",
        func=get_pull_request,
        args={**pr_args, "include_diff_content": True, "include_extra_data": True},
        critics=binary({
            "owner": 0.2,
            "repo": 0.2,
            "pull_number": 0.3,
            "include_extra_data": 0.1,
            "include_diff_content": 0.2,
        }),
    )

    # Update Pull Request
    register(
        name="Update a pull request",
        user_message="Update the title of pull request #72 in the 'ArcadeAI/test' repository to 'Updated Title'.",
        func=update_pull_request,
        args={**pr_args, "title": "Updated Title"},
        critics=binary({"owner": 0.2, "repo": 0.2, "pull_number": 0.3, "title": 0.3}),
    )

    # List Pull Request Commits
    register(
        name="List commits on a pull request",
        user_message="List all commits for PR 72 in the test repository under ArcadeAI.",
        func=list_pull_request_commits,
        args={**pr_args},
        critics=binary({"owner": 0.2, "repo": 0.2, "pull_number": 0.3}),
    )

    # Create Reply for Review Comment
    register(
        name="Create a reply to a review comment",
        user_message="Create a reply to the review comment 1778019974 in 'ArcadeAI/test' for pr 72 saying 'Thanks for the suggestion.'",
        func=create_reply_for_review_comment,
        args={
            **pr_args,
            "comment_id": 1778019974,
            "body": "Thanks for the suggestion.",
        },
        critics=[
            *binary({"owner": 0.2, "repo": 0.2, "pull_number": 0.2, "comment_id": 0.2}),
            SimilarityCritic(critic_field="body", weight=0.2),
        ],
    )

    # List Review Comments on Pull Request
    register(
        name="List all review comments on a pull request",
        user_message="List review comments for pr 72 in the ArcadeAI/test repo. Sort by updated time in ascending order.",
        func=list_review_comments_on_pull_request,
        args={**pr_args, "sort": "updated", "direction": SortDirection.ASC},
        critics=binary({
            "owner": 0.2,
            "repo": 0.2,
            "pull_number": 0.2,
            "sort": 0.2,
            "direction": 0.2,
        }),
    )

    # Create Review Comment (file-level)
    register(
        name="Create a review comment on a pull request file",
        user_message="Create a review comment on pr 72 in the 'ArcadeAI/test' repo. The comment should be on the file 'README.md' and says 'nit: you misspelled the word 'intelligence'",
        func=create_review_comment,
        args={
            **pr_args,
            "body": "nit: you misspelled the word 'intelligence'",
            "path": "README.md",
            "subject_type": ReviewCommentSubjectType.FILE,
        },
        critics=[
            *binary({"owner": 0.15, "repo": 0.15, "pull_number": 0.2}),
            SimilarityCritic(critic_field="body", weight=0.1),
            *binary({"path": 0.2, "subject_type": 0.2}),
        ],
    )

    # Create Review Comment with Line Numbers
    register(
        name="Create a review comment on specific lines of a pull request",
        user_message="Create a review comment on pull request #72 in the 'ArcadeAI/test' repository. The comment should be on the file 'src/main.py', lines 10-15, and say 'Move these to constants.py.'",
        func=create_review_comment,
        args={
            **pr_args,
            "body": "Move these to constants.py.",
            "path": "src/main.py",
            "start_line": 10,
            "end_line": 15,
            "side": DiffSide.RIGHT,
            "subject_type": ReviewCommentSubjectType.LINE,
        },
        critics=[
            *binary({"owner": 0.1, "repo": 0.1, "pull_number": 0.15}),
            SimilarityCritic(critic_field="body", weight=0.15),
            *binary({
                "path": 0.1,
                "start_line": 0.1,
                "end_line": 0.1,
                "side": 0.1,
                "subject_type": 0.1,
            }),
        ],
    )

    return suite