1. New Eval SDK (`arcade/sdk/eval.py`): - Introduces `EvalSuite`, `EvalCase`, and `EvalRubric` classes for structured evaluation. - Implements various Critic classes (Binary, Numeric, Similarity) for flexible scoring. - Adds a `tool_eval` decorator for easy integration with existing tools. 2. CLI Integration (`arcade/cli/main.py` and `arcade/cli/utils.py`): - Adds an `evals` command to run evaluation suites from the CLI. - Implements result display functionality for evaluation outcomes. 3. Toolkit Updates: - Adds evaluation scripts for the Gmail (`toolkits/gmail/evals/eval_gmail_tools.py`) and Slack (`toolkits/slack/evals/eval_slack_messaging.py`) toolkits. - Demonstrates practical usage of the Eval SDK with real-world scenarios. 4. Miscellaneous: - Updates `arcade/cli/new.py` to optionally generate an `evals` directory for new toolkits. --------- Co-authored-by: Nate Barbettini <nate@arcade-ai.com>
133 lines
2.4 KiB
TOML
133 lines
2.4 KiB
TOML
# Package metadata read by Poetry when building and publishing.
[tool.poetry]
name = "arcade-ai"
version = "0.1.0"
description = ""
# Ship the `arcade` package, located relative to this file's directory.
packages = [
    { include = "arcade", from = "." },
]
authors = ["Arcade AI <sam@arcade-ai.com>"]
|
|
|
|
# PEP 517 build backend declaration: builds are delegated to poetry-core.
[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
|
|
|
|
|
|
# Runtime dependencies installed with the package itself.
[tool.poetry.dependencies]
python = ">=3.10,<4.0"
pydantic = "^2.7.0"
pydantic-settings = "^2.2.1"
typer = "^0.9.0"
rich = "^13.7.1"
toml = "^0.10.2"
tomlkit = "^0.12.4"
requests = "^2.26.0"  # TODO: is this really needed?
openai = "^1.36.0"  # TODO: relax to an earlier version that still has what we need
pyjwt = "^2.8.0"
|
|
|
|
|
|
# Web-framework dependency groups, one per framework.
[tool.poetry.group.fastapi.dependencies]
fastapi = "^0.110.0"

[tool.poetry.group.flask.dependencies]
flask = "^3.0.3"
|
|
|
|
# Development-only tooling: testing, type checking, linting hooks and docs.
[tool.poetry.group.dev.dependencies]
pytest = "^8.1.1"
pytest-cov = "^4.0.0"
mypy = "^1.5.1"
pre-commit = "^3.4.0"
tox = "^4.11.1"
pytest-asyncio = "^0.23.7"
types-toml = "^0.10.8"
uvicorn = "^0.22.0"
mkdocs = ">=1.5.2"
mkdocs-material = ">=9.3.0"
mkdocstrings = { extras = ["python"], version = ">=0.23.1" }
|
|
|
|
# Numerical dependencies for the `evals` dependency group.
[tool.poetry.group.evals.dependencies]
scipy = "^1.14.0"
numpy = "^2.0.0"
scikit-learn = "^1.5.0"
|
|
|
|
# Console entry point: the `arcade` command invokes `cli` in arcade/cli/main.py.
[tool.poetry.scripts]
arcade = "arcade.cli.main:cli"
|
|
|
|
# Static type-checking configuration for the `arcade` package.
# Flags are native TOML booleans (lowercase `true`), not the string "True".
[tool.mypy]
files = ["arcade"]
python_version = "3.10"
disallow_untyped_defs = true
disallow_any_unimported = true
no_implicit_optional = true
check_untyped_defs = true
warn_return_any = true
warn_unused_ignores = true
show_error_codes = true
ignore_missing_imports = true
|
|
|
|
# Pytest collects tests from the `tests` directory only.
[tool.pytest.ini_options]
testpaths = ["tests"]
|
|
|
|
# Ruff linter configuration.
[tool.ruff]
# Matches the minimum interpreter declared in [tool.poetry.dependencies]
# (python = ">=3.10,<4.0") and mypy's python_version = "3.10".
target-version = "py310"
line-length = 100
fix = true
select = [
    # flake8-2020
    "YTT",
    # flake8-bandit
    "S",
    # flake8-bugbear
    "B",
    # flake8-builtins
    "A",
    # flake8-comprehensions
    "C4",
    # flake8-debugger
    "T10",
    # flake8-simplify
    "SIM",
    # isort
    "I",
    # mccabe
    "C90",
    # pycodestyle
    "E",
    "W",
    # pyflakes
    "F",
    # pygrep-hooks
    "PGH",
    # pyupgrade
    "UP",
    # ruff
    "RUF",
    # tryceratops
    "TRY",
]
# TODO work to remove these
ignore = [
    # LineTooLong
    "E501",
    # DoNotAssignLambda
    "E731",
    # raise from (cli specific)
    "TRY200",
    # Depends function in arg string
    "B008",
    # raise from (cli specific)
    "B904",
    # long message exceptions
    "TRY003",
]
|
|
|
|
# Opt in to Ruff's preview formatting style.
[tool.ruff.format]
preview = true
|
|
|
|
# coverage.py reporting: leave files with no executable code out of the report.
[tool.coverage.report]
skip_empty = true

# coverage.py measurement: record branch coverage for the `arcade` package.
[tool.coverage.run]
branch = true
source = ["arcade"]
|
|
|
|
|
|
# Tests may use bare `assert` statements (flake8-bandit S101).
[tool.ruff.per-file-ignores]
"tests/*" = ["S101"]
|