arcade-mcp/toolkits/code_sandbox/evals/eval_e2b.py
Eric Gustin 8b46e4f7f9
Add Code Sandbox Tools (#114)
# PR Description
This PR creates a new toolkit called CodeSandbox. This toolkit has two
tools:
1. `RunCode`: Creates an E2B sandbox and runs the provided code in that
sandbox. Returns the execution logs, result, and errors. Supports
Python, JavaScript, R, Java, and Bash code.
2. `CreateStaticMatplotlibChart`: Creates a sandbox, runs the provided
python code that uses matplotlib, and returns the base64 encoded image
of the chart along with any logs or errors.
- I recommend not using `tool_choice="generate"`, since the returned object
contains a base64-encoded image, which can consume a lot of tokens without
adding much value to a generated response.
    
    
    
Example of creating a pie chart:
```python
import base64
import json
import os

from openai import OpenAI


def call_tool_with_openai(client: OpenAI) -> dict:
    """Request a pie chart through the chat-completions endpoint.

    Sends one user message via ``client.chat.completions.create`` with
    ``CodeSandbox.CreateStaticMatplotlibChart`` as the only tool and
    ``tool_choice="execute"``, then returns the response untouched.

    NOTE(review): the return annotation says ``dict``, but the caller below
    reads ``.choices`` as an attribute of the returned value -- confirm the
    intended annotation.
    """
    prompt = "There are 17 red apples, 4 green apples, and 10 yellow apples. Create a pie chart for this data."
    user_turn = {"role": "user", "content": prompt}

    return client.chat.completions.create(
        messages=[user_turn],
        model="gpt-4o-mini",
        user="you@example.com",
        tools=["CodeSandbox.CreateStaticMatplotlibChart"],
        tool_choice="execute",
    )


# Client configuration: the API key is read from the environment and the
# base URL points at a locally running endpoint.
cloud_host = "http://localhost:9099/v1"
arcade_api_key = os.environ.get("ARCADE_API_KEY")
openai_client = OpenAI(api_key=arcade_api_key, base_url=cloud_host)

# Run the tool call and pick out the id of the first tool call in the reply.
chat_result = call_tool_with_openai(openai_client)
first_message = chat_result.choices[0].message
tool_call_id = first_message.tool_calls[0].id

# The message content is JSON keyed by tool call id; the chart is stored as
# base64 text under value -> base64_image.
content = json.loads(first_message.content)
base64_image = content[tool_call_id]["value"]["base64_image"]

# Decode before opening the file so a bad payload never leaves an empty PNG.
image_data = base64.b64decode(base64_image)
with open("output_image.png", "wb") as image_file:
    image_file.write(image_data)

```
2024-11-15 13:29:52 -08:00

117 lines
2.8 KiB
Python

import arcade_code_sandbox
from arcade_code_sandbox.tools.e2b import create_static_matplotlib_chart, run_code
from arcade_code_sandbox.tools.models import E2BSupportedLanguage
from arcade.core.catalog import ToolCatalog
from arcade.sdk.eval import (
EvalRubric,
EvalSuite,
tool_eval,
)
from arcade.sdk.eval.critic import BinaryCritic, SimilarityCritic
# Python snippet used by the "Run code" eval case, both inside the user
# message and as the expected `code` argument (paired with
# E2BSupportedLanguage.PYTHON). The text is meant to be run as Python, so the
# indentation inside the string is significant: without it the snippet fails
# with IndentationError before any sorting happens.
merge_sort_code = """
def merge_sort(arr):
    if len(arr) <= 1:
        return arr
    mid = len(arr) // 2
    left = merge_sort(arr[:mid])
    right = merge_sort(arr[mid:])
    return merge(left, right)
def merge(left, right):
    result = []
    i, j = 0, 0
    while i < len(left) and j < len(right):
        if left[i] < right[j]:
            result.append(left[i])
            i += 1
        else:
            result.append(right[j])
            j += 1
    result.extend(left[i:])
    result.extend(right[j:])
    return result
sample_list = ["banana", "apple", "cherry", "date", "elderberry"]
sorted_list = merge_sort(sample_list)
print("Sorted list:", sorted_list)
"""
# Matplotlib snippet used by the "Create static matplotlib chart" eval case,
# both inside the user message and as the expected `code` argument. Written as
# one literal per line so the exact text (leading newline included) does not
# depend on how this source file is indented.
matplotlib_chart_code = (
    "\n"
    "import matplotlib.pyplot as plt\n"
    "labels = ['Apples', 'Bananas', 'Cherries', 'Dates']\n"
    "sizes = [30, 25, 20, 25]\n"
    "colors = ['red', 'yellow', 'purple', 'brown']\n"
    "plt.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)\n"
    "plt.axis('equal')\n"
    "plt.title('Fruit Distribution')\n"
    "plt.savefig('fruit_pie_chart.png')\n"
)
# Rubric handed to the suite below through `rubric=`; both thresholds are set
# explicitly here.
rubric = EvalRubric(fail_threshold=0.85, warn_threshold=0.95)

# Tool catalog handed to the suite below through `catalog=`, populated from
# the arcade_code_sandbox module.
catalog = ToolCatalog()
catalog.add_module(arcade_code_sandbox)
@tool_eval()
def code_sandbox_eval_suite():
    """Assemble the evaluation suite for the code_sandbox tools.

    Two cases are registered on the suite:

    * "Run code" expects a ``run_code`` call whose arguments are the merge
      sort snippet and ``E2BSupportedLanguage.PYTHON``. The ``code`` field is
      checked by a ``SimilarityCritic`` (weight 0.8) and the ``language``
      field by a ``BinaryCritic`` (weight 0.2).
    * "Create static matplotlib chart" expects a
      ``create_static_matplotlib_chart`` call whose only argument is the
      chart snippet, checked by a ``SimilarityCritic`` (weight 1.0).

    Returns:
        The ``EvalSuite`` with both cases added.
    """
    suite = EvalSuite(
        name="code_sandbox Tools Evaluation",
        system_message="You are an AI assistant with access to code_sandbox tools. Use them to help the user with their tasks.",
        catalog=catalog,
        rubric=rubric,
    )

    # Case 1: run an arbitrary snippet with an explicit language.
    expected_run_code_call = (
        run_code,
        {
            "code": merge_sort_code,
            "language": E2BSupportedLanguage.PYTHON,
        },
    )
    suite.add_case(
        name="Run code",
        user_message=f"Can you please run my merge sort algo?\n\n{merge_sort_code}",
        expected_tool_calls=[expected_run_code_call],
        critics=[
            SimilarityCritic(critic_field="code", weight=0.8),
            BinaryCritic(critic_field="language", weight=0.2),
        ],
    )

    # Case 2: the prompt only says "Run this code", yet the expected call is
    # the chart tool rather than run_code.
    expected_chart_call = (
        create_static_matplotlib_chart,
        {"code": matplotlib_chart_code},
    )
    suite.add_case(
        name="Create static matplotlib chart",
        user_message=f"Run this code:\n\n{matplotlib_chart_code}",
        expected_tool_calls=[expected_chart_call],
        critics=[
            SimilarityCritic(critic_field="code", weight=1.0),
        ],
    )

    return suite