4 simple flows working

This commit is contained in:
Sam Partee 2024-05-08 21:25:23 -07:00
parent ea708d62f0
commit 6272a426f1
12 changed files with 484 additions and 212 deletions

View file

@ -11,4 +11,6 @@ email = "sam@partee.io"
SendEmail = "gmailer.send_email@0.1.0"
ReadEmail = "gmailer.read_email@0.1.0"
PlotDataframe = "gmailer.plot_dataframe@0.1.0"
ReadSqlite = "read_sqlite.read_sqlite@0.1.0"
Summarize = "chat.summarize@0.1.0"
VectorSearch = "search.vector_search@0.1.0"

View file

@ -10,3 +10,5 @@ email = "sam@partee.io"
[modules]
gmailer = "0.1.0"
chat = "0.1.0"
search = "0.1.0"
read_sqlite = "0.1.0"

View file

@ -7,8 +7,8 @@ import openai
@tool
async def summarize(
#text: Param(str, "Text to summarize"),
data_id: Param(int, "ID of the data to summarize"),
text: Param(str, "Text to summarize"),
#data_id: Param(int, "ID of the data to summarize"),
system_prompt: Param(str, "System prompt to use") = "Summarize the following text",
max_tokens: Param(int, "Maximum number of tokens to generate") = 1000,
) -> Param(str, "Summarized text"):
@ -21,11 +21,15 @@ async def summarize(
Returns:
str: The summarized text.
"""
df = await get_df(data_id)
text = df.to_json(orient='records')
#df = await get_df(data_id)
#text = df.to_json(orient='records')
api_key = get_secret("openai_api_key", None)
model = get_secret("openai_model_summarize", "gpt-4-turbo")
# Call the OpenAI model with the tools and messages
if isinstance(text, list):
text = "\n".join(text)
messages = [
{"role": "system", "content": system_prompt},
{"role": "user", "content": text},

View file

@ -48,9 +48,9 @@ async def send_email(
@tool
async def read_email(
output_name: Param(str, "Name of the output data"),
#output_name: Param(str, "Name of the output data"),
n_emails: Param(int, "Number of emails to read") = 5,
):
) -> Param(str, "emails"):
"""Read emails from a Gmail account and extract plain text content, removing any HTML."""
email_address = get_secret("gmail_email")
@ -70,31 +70,37 @@ async def read_email(
emails = []
for email_id in email_ids[:n_emails]:
result, data = mail.fetch(email_id, "(RFC822)")
raw_email = data[0][1]
msg = email.message_from_bytes(raw_email)
try:
result, data = mail.fetch(email_id, "(RFC822)")
raw_email = data[0][1]
msg = email.message_from_bytes(raw_email)
email_details = {
"from": msg["From"],
"to": msg["To"],
"date": msg["Date"]
}
email_details = {
"from": msg["From"],
"to": msg["To"],
"date": msg["Date"]
}
if msg.is_multipart():
for part in msg.walk():
if part.get_content_type() == "text/plain":
body = part.get_payload(decode=True).decode('utf-8')
email_details["body"] = clean_email_body(body)
else:
body = msg.get_payload(decode=True).decode('utf-8')
email_details["body"] = clean_email_body(body)
if msg.is_multipart():
for part in msg.walk():
if part.get_content_type() == "text/plain":
body = part.get_payload(decode=True).decode('utf-8')
email_details["body"] = clean_email_body(body)
else:
body = msg.get_payload(decode=True).decode('utf-8')
email_details["body"] = clean_email_body(body)
except Exception as e:
print(f"Error reading email {email_id}: {e}")
continue
emails.append(email_details)
mail.close()
mail.logout()
df = pd.DataFrame(emails)
await save_df(df, output_name)
#df = pd.DataFrame(emails)
#await save_df(df, output_name)
data = "\n".join([f"{email['from']} - {email['date']}\n{email['body']}\n" for email in emails])
return data

View file

@ -0,0 +1,43 @@
from toolserve.sdk import Param, tool, get_secret
from toolserve.sdk.dataframe import save_df
import pandas as pd
from sqlite3 import connect
@tool
async def read_sqlite(
file_path: Param(str, "Path to the SQLite database file"),
table_name: Param(str, "Name of the table to read from"),
output_name: Param(str, "Name of the output data to save"),
) -> Param(str, "Output data name"):
"""Read data from a SQLite database table and save it as a DataFrame.
Args:
file_path (str): Path to the SQLite database file.
table_name (str): Name of the table to read from.
output_name (str): Name of the output data to save.
Returns:
str: Name of the output data.
"""
# Connect to the SQLite database
conn = connect(file_path)
cursor = conn.cursor()
# Read the data from the table
query = f"SELECT * FROM {table_name}"
cursor.execute(query)
rows = cursor.fetchall()
# Get the column names
cursor.execute(f"PRAGMA table_info({table_name})")
columns = [col[1] for col in cursor.fetchall()]
# Create a DataFrame from the data
df = pd.DataFrame(rows, columns=columns)
# Save the DataFrame
await save_df(df, output_name)
return output_name

View file

@ -0,0 +1,65 @@
import faiss
import numpy as np
from typing import List
from fastembed import TextEmbedding
from toolserve.sdk import Param, tool, get_secret
from toolserve.sdk.dataframe import get_df
@tool
async def vector_search(
data_id: Param(int, "The ID of the data source containing the documents"),
query: Param(str, "The text to find within the documents"),
column_name: Param(str, "The name of the column containing the documents"),
n_results: Param(int, "The number of top results to return") = 5
) -> Param(List[str], "The documents most similar to the query"):
"""Create a FAISS index from a list of documents and search for the query, returning the most similar documents.
Args:
query (str): The text query to search for. Should be written like a document.
column_name (str): The name of the column containing the documents.
n_results (int, optional): The number of top results to return. Defaults to 5.
Returns:
List[str]: The documents most similar to the query based on the search.
"""
# Get the data
df = await get_df(data_id)
docs = df[column_name].tolist()
# Initialize the embedding model
embedding_tool = TextEmbedding()
# Embed all documents
embeddings = []
for doc in docs:
# Get the generator from the embed method
doc_embedding_generator = embedding_tool.embed([doc])
# Convert the generator to a list and take the first element
doc_embedding = list(doc_embedding_generator)[0]
embeddings.append(doc_embedding)
# Convert list of embeddings to a numpy array and ensure type float32
embeddings = np.vstack(embeddings).astype('float32')
# Create a flat L2 index
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings) # Add embeddings to the index
# Embed the query
query_embedding_generator = embedding_tool.embed([query])
query_embedding = list(query_embedding_generator)[0]
query_embedding = np.array(query_embedding, dtype='float32').reshape(1, -1)
# Search the index
distances, indices = index.search(query_embedding, n_results)
# Fetch the documents corresponding to the top indices
top_docs = [docs[i] for i in indices.flatten().tolist()]
return top_docs

View file

@ -2,7 +2,7 @@ import httpx
import json
import time
import openai
from typing import Any, Dict, List, Optional
from typing import Any, Dict, List, Optional, Tuple, Union
from pydantic import BaseModel
from typing import List, Dict
@ -17,30 +17,77 @@ from typing import Dict, Any, Optional
import json
from collections import deque
def pydantic_to_openai_tool(model: Type[BaseModel]) -> str:
"""
Convert a Pydantic model to an OpenAI tool schema.
Args:
model (Type[BaseModel]): The Pydantic model to convert.
Returns:
str: The OpenAI tool schema.
"""
schema = model_to_json_schema(model)
tool_schema = {
"type": "function",
"function": {
"name": model.__name__,
"description": model.__doc__ or "",
"parameters": schema
}
}
return json.dumps(tool_schema)
class Edge(BaseModel):
source: int = Field(..., description="The ID of the source node")
target: int = Field(..., description="The ID of the target node")
class ToolNode(BaseModel):
node_id: int = Field(..., description="The ID of the node", ge=0)
input_name: Optional[str] = Field(None, description="The name of the input data")
tool_name: str = Field(..., description="The name of the tool to execute")
output_name: Optional[str] = Field(None, description="The name of the output data")
predict_args: bool = Field(True, description="Whether to predict the arguments for the tool")
from_node: Optional[Dict[str, int]] = Field(None, description="The ID of the source node name of the argument to pass to the tool")
args: Optional[Dict[str, Any]] = Field(None, description="The arguments to pass to the tool")
class OutputType(Enum):
DATA = "data"
CHAT = "chat"
ARTIFACT = "artifact"
class FlowSchema(BaseModel):
"""A graph based representation of functions (nodes), and their data flow (edges)"""
nodes: List[ToolNode] = Field(..., description="The nodes in the flow")
edges: List[Edge] = Field([], description="The IDs of the adjacent nodes")
output_type: OutputType = Field(OutputType.CHAT, description="The type of the output")
class Config:
arbitrary_types_allowed = True
use_enum_values = True
class ToolClient:
available_tools = {
"query_sql": "/tool/query/query_sql",
"list_data_sources": "/tool/query/list_data_sources",
"get_data_schema": "/tool/query/get_data_schema",
"PlotDataframe": "/tool/gmailer/PlotDataframe",
"ReadEmail": "/tool/gmailer/ReadEmail",
"Summarize": "/tool/chat/Summarize",
}
def __init__(self, base_url: str):
self.base_url = base_url
self.client = httpx.Client(timeout=30)
self.tools = self.__collect_tool_specs()
self.client = httpx.Client(timeout=3000)
tools, routes = self.__collect_tool_specs()
self.tools = tools
self.available_tools = routes
def __collect_tool_specs(self) -> Dict[str, str]:
tools_list = self.call_api("GET", "/api/v1/tools/list").get("data", {})
all_tools = [tool["name"] for tool in tools_list]
routes = {tool["name"]: tool["endpoint"] for tool in tools_list}
tools = {}
for tool_name, endpoint in self.available_tools.items():
for tool_name, endpoint in routes.items():
openai_spec = self.call_api("GET", "/api/v1/tools/oai_function", params={"tool_name": tool_name}).get("data", {})
tools[tool_name] = openai_spec
return tools
return tools, routes
def call_api(self, method: str, endpoint: str, params: dict = {}, data: dict = {}, json_data: dict = {}) -> Dict[str, Any]:
"""Call the Darkstar Toolserver API with the given parameters.
@ -173,7 +220,6 @@ class ToolRunner:
raise ValueError(f"Tool '{tool_name}' not found in available tools.")
tool = json.loads(func_spec)
print(tool)
# Call the OpenAI model with the tools and messages
completion = self._openai_client.chat.completions.create(
model="gpt-4-turbo",
@ -198,26 +244,37 @@ class ToolRunner:
if "output_name" in args and output_name != "None":
args["output_name"] = output_name
if "data_id" in args:
args["data_id"] = self._data_id
return args
def run_tool(self, tool_name: str, user_query: str, source: str, output_name: str) -> Any:
def run_tool(self, tool: ToolNode, user_query: str, **kwargs) -> Any:
"""
Executes an tool using the Darkstar Toolserver API and an OpenAI model.
:param tool_name: The name of the tool to execute.
:param user_query: The user query to provide to the model.
:return: The result of the tool
"""
source = None
if tool.input_name:
source = tool.input_name
self.set_source(source)
print(f"Tool Name: {tool_name}")
print(f"Data ID: {self._data_id}")
print(f"Sourcing data from {source}")
messages = self.__create_prompt(user_query, source, output_name)
tool_args = self.get_tool_args(tool_name, messages, output_name)
result = self._client.execute_tool(tool_name, tool_args)
if tool.predict_args:
messages = self.__create_prompt(user_query, source, tool.output_name)
tool_args = self.get_tool_args(tool.tool_name, messages, tool.output_name)
elif tool.from_node:
# todo change to list
tool_args = kwargs.get("tool_args", {})
else:
tool_args = {}
# TODO would something ever have an input_name and not need a data_id?
if tool.input_name:
tool_args["data_id"] = self._data_id
if tool.args:
tool_args.update(tool.args)
print("Calling tool with args:", tool_args)
result = self._client.execute_tool(tool.tool_name, tool_args)
return result
def get_data_object(self, data_id: int) -> Dict[str, Any]:
@ -230,63 +287,10 @@ class ToolRunner:
return self._client.call_api("GET", f"/api/v1/data/object/{data_id}")["data"]["json_blob"]
def pydantic_to_openai_tool(model: Type[BaseModel]) -> str:
"""
Convert a Pydantic model to an OpenAI tool schema.
Args:
model (Type[BaseModel]): The Pydantic model to convert.
Returns:
str: The OpenAI tool schema.
"""
schema = model_to_json_schema(model)
tool_schema = {
"type": "function",
"function": {
"name": model.__name__,
"description": model.__doc__ or "",
"parameters": schema
}
}
return json.dumps(tool_schema)
class Edge(BaseModel):
source: int = Field(..., description="The ID of the source node")
target: int = Field(..., description="The ID of the target node")
class ToolNode(BaseModel):
node_id: int = Field(..., description="The ID of the node", ge=0)
input_name: Optional[str] = Field(None, description="The name of the input data")
tool_name: str = Field(..., description="The name of the tool to execute")
output_name: Optional[str] = Field(..., description="The name of the output data")
class OutputType(Enum):
DATA = "data"
CHAT = "chat"
ARTIFACT = "artifact"
class FlowSchema(BaseModel):
"""A graph based representation of functions (nodes), and their data flow (edges)"""
nodes: List[ToolNode] = Field(..., description="The nodes in the flow")
edges: List[Edge] = Field([], description="The IDs of the adjacent nodes")
output_type: OutputType = Field(OutputType.CHAT, description="The type of the output")
class Config:
arbitrary_types_allowed = True
use_enum_values = True
class ToolFlow:
tools = {
"query_sql": (OutputType.DATA, True, False),
"PlotDataframe": (OutputType.ARTIFACT, False, True),
"ReadEmail": (OutputType.CHAT, True, False),
"Summarize": (OutputType.CHAT, False, True),
}
def __init__(
self,
name: str,
@ -304,24 +308,6 @@ class ToolFlow:
self.openai_client = openai.Client(api_key=model_api_key)
def __create_prompt(self, user_query: str) -> List[Dict[str, str]]:
tool_list = ""
for tool, spec in self.tools.items():
tool_list += f"- Name: {tool}\n"
tool_list += f" - Output Type: {spec[0].value}\n"
tool_list += f" - Can be source node: {spec[1]}\n"
tool_list += f" - Can be sink node: {spec[2]}\n"
source_list = "\n".join(self.runner._data_sources.keys())
prompt = self.prompt.format(nodes=tool_list, sources=source_list)
messages = [
{"role": "system", "content": prompt},
{"role": "user", "content": user_query}
]
return messages
def infer_flow(self, user_query: str) -> FlowSchema:
"""
Infer the tool flow based on the user query.
@ -364,10 +350,17 @@ class ToolFlow:
# Initialize a queue for BFS
execution_queue = deque([flow_schema['nodes'][0]]) # Start BFS from the source node
# Queue up all nodes which don't have incoming edges
incoming_edges = {node['node_id']: 0 for node in flow_schema['nodes']}
for edge in flow_schema.get('edges', []):
incoming_edges[edge['target']] += 1
execution_queue = deque([node for node in flow_schema['nodes'] if incoming_edges[node['node_id']] == 0])
visited = set()
results = {}
timings = {}
flow_start_time = time.time()
while execution_queue:
current_node = execution_queue.popleft()
node_id = current_node['node_id']
@ -376,14 +369,22 @@ class ToolFlow:
continue
visited.add(node_id)
exec_start_time = time.time()
# Execute the current node's operation using runner.run_tool
operation_result = self.runner.run_tool(
current_node['tool_name'],
user_query,
current_node['input_name'],
current_node['output_name']
)
current_tool = ToolNode(**current_node)
if current_tool.from_node:
tool_args = {}
for arg_name, from_node_id in current_tool.from_node.items():
from_node_result = results[from_node_id]["data"]["result"]
tool_args[arg_name] = from_node_result
operation_result = self.runner.run_tool(current_tool, user_query, tool_args=tool_args)
else:
operation_result = self.runner.run_tool(current_tool, user_query)
results[node_id] = operation_result
exec_end_time = time.time()
timings[current_tool.tool_name] = exec_end_time - exec_start_time
# Enqueue all adjacent nodes
for edge in flow_schema.get('edges', []):
@ -397,7 +398,13 @@ class ToolFlow:
sink_node = flow_schema['nodes'][-1]
sink_tool_name = sink_node['tool_name']
sink_node_id = sink_node['node_id']
sink_output_type = self.tools[sink_tool_name][0]
# TODO: Tools need to specify output type
#sink_output_type = self.tools[sink_tool_name][0]
sink_output_type = OutputType(flow_schema['output_type'])
flow_end_time = time.time()
timings['total'] = flow_end_time - flow_start_time
if sink_output_type == OutputType.DATA:
data = self.runner.get_data_object(self.runner._data_id)
elif sink_output_type == OutputType.CHAT:
@ -405,59 +412,7 @@ class ToolFlow:
else:
data = results[sink_node_id]
return (data, results, sink_output_type)
def summarize_flow_results(model_client, flow_results: Dict[str, Any], flow_schema) -> str:
"""
Summarizes the results of a tool flow execution using an OpenAI model to generate a chat response.
Args:
model_client (openai.Client): The OpenAI client to use for generating chat responses.
flow_results (Dict[str, Any]): The results of the tool flow execution.
flow_schema (Dict[str, Any]): The schema representing the tool flow.
Returns:
Dict[str, str]: A dictionary containing the chat response under the key "data".
"""
try:
# Check if flow_results is already a JSON string, otherwise convert it
if isinstance(flow_results, str):
flow_summary = flow_results
else:
flow_summary = json.dumps(flow_results, indent=2)
# Construct a concise and informative prompt for the chat model
prompt_content = dedent(f"""
Please review the tool execution results and the flow schema provided below.
Use the results of the final tool to describe the outcomes. Be concise and only use the provided information.
If the results seem incorrect or incomplete, kindly ask the user to reformulate their query for better accuracy.
The execution path, expressed a a JSON object where nodes represent tools and edges represent data flow:
{flow_schema}
The results of the execution, expressed as a JSON object:
{flow_summary}
""")
messages = [
{"role": "system", "content": prompt_content}
]
# Call the OpenAI chat model
response = model_client.chat.completions.create(
model="gpt-4-turbo",
messages=messages
)
# Extract the chat response
chat_response = response.choices[0].message.content
return chat_response
except Exception as e:
print(f"Error in summarizing flow results: {e}")
return "Error: Failed to generate summary due to an internal error."
return (data, results, sink_output_type, timings)
@ -474,7 +429,7 @@ plotting_flow = FlowSchema(
)
email_flow = FlowSchema(
email_flow_1 = FlowSchema(
nodes=[
ToolNode(node_id=0, input_name=None, tool_name="ReadEmail", output_name="email_data_1"),
ToolNode(node_id=1, input_name="email_data_1", tool_name="Summarize", output_name=None),
@ -485,11 +440,68 @@ email_flow = FlowSchema(
output_type=OutputType.CHAT
)
email_flow = FlowSchema(
nodes=[
ToolNode(node_id=0, tool_name="ReadEmail"),
ToolNode(node_id=1, tool_name="Summarize", from_node={"text": 2}, predict_args=False),
],
edges=[
Edge(source=0, target=1)
],
output_type=OutputType.CHAT
)
class Agent:
review_db = "/Users/spartee/Dropbox/Arcade/platform/toolserver/examples/data/food-reviews/database.sqlite"
review_flow = FlowSchema(
nodes=[
ToolNode(node_id=0, tool_name="ReadSqlite", args={"table_name": "Reviews", "file_path": review_db, "output_name": "reviews"}, predict_args=False),
ToolNode(node_id=1, input_name="reviews", tool_name="query_sql", output_name="review_data"),
ToolNode(node_id=2, input_name="review_data", tool_name="search_text_columns"),
ToolNode(node_id=3, tool_name="Summarize", from_node={"text": 2}, predict_args=False),
],
edges=[
Edge(source=0, target=1),
Edge(source=1, target=2),
Edge(source=2, target=3)
],
output_type=OutputType.CHAT
)
def __init__(self, flows: Dict[str, FlowSchema]):
self.flows = flows
shopify_db = "/Users/spartee/Dropbox/Arcade/platform/toolserver/examples/data/olist.sqlite"
customer_flow = FlowSchema(
nodes=[
ToolNode(node_id=0, tool_name="ReadSqlite", args={"table_name": "customers", "file_path": shopify_db, "output_name": "customers"}, predict_args=False),
ToolNode(node_id=1, tool_name="ReadSqlite", args={"table_name": "orders", "file_path": shopify_db, "output_name": "all_customer_orders"}, predict_args=False),
ToolNode(node_id=2, input_name="customers", tool_name="query_sql", output_name="customer_data"),
ToolNode(node_id=3, input_name="all_customer_orders", tool_name="query_sql", output_name="customer_orders"),
ToolNode(node_id=4, input_name="customer_data", tool_name="get"),
ToolNode(node_id=5, input_name="customer_orders", tool_name="get"),
ToolNode(node_id=6, tool_name="combine_results", from_node={"result_1": 4, "result_2": 5}, predict_args=False),
ToolNode(node_id=7, tool_name="Summarize", from_node={"text": 6}, predict_args=False)
],
edges=[
Edge(source=0, target=2),
Edge(source=1, target=3),
Edge(source=2, target=4),
Edge(source=3, target=5),
Edge(source=4, target=6),
Edge(source=5, target=6),
Edge(source=6, target=7)
],
output_type=OutputType.CHAT
)
def print_flow_as_yaml(data: Dict[str, Any]):
data_dict = data.dict(exclude_unset=True) if isinstance(data, BaseModel) else data
# Convert the dictionary to a YAML formatted string
yaml_str = yaml.dump(data_dict, sort_keys=False)
# Print the YAML string
print(yaml_str)

View file

@ -21,7 +21,7 @@ from streamlit_chat import message
import streamlit.components.v1 as components
from textwrap import dedent
import plotly.express as px
from agent import ToolFlow, email_flow, plotting_flow
from agent import ToolFlow, email_flow, plotting_flow, review_flow, customer_flow
@ -67,8 +67,9 @@ def plot_flow(data: Dict[str, Any]):
# Check if there are any nodes to determine a start node for bfs_layout
if G.nodes:
start_node = next(iter(G.nodes)) # Get an arbitrary start node
pos = nx.bfs_layout(G, start_node)
#start_node = next(iter(G.nodes)) # Get an arbitrary start node
#pos = nx.bfs_layout(G, start_node)
pos = nx.spring_layout(G)
else:
pos = {}
@ -94,7 +95,7 @@ def get_agent():
# From here down is all the StreamLit UI.
st.set_page_config(page_title="Arcade AI Demo", page_icon=":robot:", layout="wide")
dropdown_options = ["Gmailer", "PlotBot"]
dropdown_options = ["Gmailer", "PlotBot", "ReviewChat", "CustomerService"]
selected_option = st.sidebar.selectbox("Select an App:", dropdown_options)
st.sidebar.write(f"Selected App: {selected_option}")
@ -132,6 +133,10 @@ def submit():
json_flow = email_flow.dict()
elif selected_option == "PlotBot":
json_flow = plotting_flow.dict()
elif selected_option == "ReviewChat":
json_flow = review_flow.dict()
elif selected_option == "CustomerService":
json_flow = customer_flow.dict()
else:
st.error("Invalid option selected")
return
@ -160,20 +165,27 @@ if st.session_state["generated"]:
message(st.session_state["past"][i], is_user=True, key=str(i) + "_user")
result = st.session_state["generated"][i]
res, all_results, output_type = result
result_tab, all_results_tab, times_tab = st.tabs(["Result", "All Results", "Execution Times"])
res, all_results, output_type, timings = result
with all_results_tab:
st.write(all_results)
with times_tab:
st.write(timings)
with result_tab:
output_type = output_type.value
if output_type == "artifact":
# plot the json returned in res
fig_json = res["data"]["result"]
# plot the json with ploylu atream lit
st.plotly_chart(json.loads(fig_json))
elif output_type == "chat":
st.write(res)
elif output_type == "data":
json_res = json.loads(res)["data"]
st.dataframe(json_res)
else:
st.error("Returned result:")
st.error(res)
output_type = output_type.value
if output_type == "artifact":
# plot the json returned in res
fig_json = res["data"]["result"]
# plot the json with ploylu atream lit
st.plotly_chart(json.loads(fig_json))
elif output_type == "chat":
st.write(res)
elif output_type == "data":
json_res = json.loads(res)["data"]
st.dataframe(json_res)
else:
st.error("Returned result:")
st.error(res)

View file

@ -0,0 +1,107 @@
from typing import List, Dict, Any
from toolserve.sdk.dataframe import get_df, save_df
from toolserve.sdk.tool import tool, Param
@tool
async def get(
data_id: Param(int, "ID of the data")
) -> Param(str, "data"):
"""Get data by ID"""
df = await get_df(data_id)
return df.to_json(orient='records')
@tool
async def select_columns(
data_id: Param(int, "ID of the data"),
columns: Param(List[str], "Columns to select")
) -> Param(str, "data"):
"""Select columns from a DataFrame"""
df = await get_df(data_id)
df = df[columns]
return df.to_json(orient='records')
@tool
async def filter_rows(
data_id: Param(int, "ID of the data"),
column: Param(str, "Column to filter"),
value: Param(str, "Value to filter by")
) -> Param(str, "data"):
"""Filter rows in a DataFrame"""
df = await get_df(data_id)
df = df[df[column] == value]
return df.to_json(orient='records')
@tool
async def sort(
data_id: Param(int, "ID of the data"),
column: Param(str, "Column to sort by"),
ascending: Param(bool, "Sort ascending or descending") = True
) -> Param(str, "data"):
"""Sort a DataFrame by a column"""
df = await get_df(data_id)
df = df.sort_values(by=column, ascending=ascending)
return df.to_json(orient='records')
@tool
async def group_by(
data_id: Param(int, "ID of the data"),
columns: Param(List[str], "Columns to group by"),
aggregations: Param(Dict[str, str], "Aggregations to perform")
) -> Param(str, "data"):
"""Group by columns and perform aggregations"""
df = await get_df(data_id)
df = df.groupby(columns).agg(aggregations)
return df.to_json(orient='records')
@tool
async def join(
data_id1: Param(int, "ID of the first data"),
data_id2: Param(int, "ID of the second data"),
on: Param(str, "Column to join on"),
how: Param(str, "Type of join") = "inner"
) -> Param(str, "data"):
"""Join two DataFrames"""
df1 = await get_df(data_id1)
df2 = await get_df(data_id2)
df = df1.merge(df2, on=on, how=how)
return df.to_json(orient='records')
@tool
async def search_text_columns(
data_id: Param(int, "ID of the data"),
query: Param(str, "Text to search for"),
column: Param(str, "Column to search in"),
max_rows: Param(int, "Maximum number of rows to return") = 50
) -> Param(str, "data"):
"""Search text in columns
Search for a text query in a specific column of a DataFrame.
Args:
data_id (int): The ID of the data source to search in.
query (str): The text to search for.
column (str): The column to search in.
Returns:
str: The data source after filtering for the text query, limited to a maximum number of rows.
"""
df = await get_df(data_id)
# Ensure the column data is treated as string
df[column] = df[column].astype(str)
# Use regex=False to treat the query as a literal string, avoiding any regex special character issues
mask = df[column].str.contains(query, case=False, na=False, regex=False)
df = df[mask]
# Limit the number of rows returned
df = df.head(max_rows)
return df.to_json(orient='records')
@tool
def combine_results(
result_1: Param(str, "First result"),
result_2: Param(str, "Second result")
) -> Param(str, "data"):
"""Combine two results"""
return str(result_1) + str(result_2)

View file

@ -53,10 +53,13 @@ async def query_sql(
"""Query a data source using SQL
The SQL query should be parameterized with DuckDB's syntax. For example, to query a
DataFrame named `df` with a parameter `param`, the query should be `SELECT * FROM df WHERE column = ?`.
DataFrame named `df` with a parameter `param`, the query should be `SELECT * FROM df WHERE ? = ?`.
The list of params should be in order of the parameters in the SQL query.
IMPORTANT: There should be no parameters in the query.
For example: `SELECT * FROM df WHERE name = ?` should be `SELECT * FROM df WHERE ? = ?`.
After the query, a new data source at a new id will be created with the results and
the schema of the data source will be returned.
@ -71,7 +74,6 @@ async def query_sql(
"""
try:
# Retrieve the DataFrame and execute the SQL query using DuckDB
import duckdb
df = await get_df(data_id)
con = duckdb.connect(database=':memory:', read_only=False)
con.register('df_table', df)
@ -88,10 +90,12 @@ async def query_sql(
except Exception as e:
# Log the error and raise an exception
await log(f"Failed to execute query: {str(e)}", level="ERROR")
log_message = f"Failed to execute query: {str(e)}."
log_message += f" -- SQL: {sql}"
log_message += f" -- Parameters: {params}"
await log(log_message, level="ERROR")
raise RuntimeError(f"Query execution failed: {str(e)}")
def get_df_info(df: pd.DataFrame, data_id: Optional[int]=None) -> Dict[str, Union[int, str]]:
"""
Generate a compact string representation of a DataFrame including the count of columns,

View file

@ -120,7 +120,14 @@ class ToolCatalog:
return None
def list_tools(self) -> List[Dict[str, str]]:
return [{'name': t.name, 'description': t.description} for t in self.tools.values()]
def get_tool_endpoint(t: ToolSchema) -> str:
return f"/tool/{t.meta.module}/{t.name}"
return [
{'name': t.name,
'description': t.description,
'endpoint': get_tool_endpoint(t)
} for t in self.tools.values()]

View file

@ -24,6 +24,14 @@ class Settings(BaseSettings):
"query.list_data_sources@builtin",
"query.get_data_schema@builtin",
"query.query_sql@builtin",
"data.get@builtin",
"data.select_columns@builtin",
"data.filter_rows@builtin",
"data.sort@builtin",
"data.group_by@builtin",
"data.join@builtin",
"data.search_text_columns@builtin",
"data.combine_results@builtin",
]
# Env Config