Notion Toolkit Optimizations (#379)

1. Paginate through all blocks at the current level before recursing into
children (before this PR, pagination and recursion were interleaved: the
children of each page of blocks were fetched before the next page of siblings)
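2. For top-level blocks only, split the blocks into chunks and convert the
chunks to markdown concurrently (the chunk size is `len(blocks) // 5`, minimum
1, so there are about 5 chunks, and more than 5 when the block count is not a
multiple of 5)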


---------

From my local timing benchmarks, this reduces the tool call's latency by ~60%
(from ~23 seconds to ~9.1 seconds) for larger Notion pages


Without optimization (times in ms): Avg 22995
Attempt 1: 27503.49497795105
Attempt 2: 20863.977909088135
Attempt 3: 20888.309955596924
Attempt 4: 18574.61714744568
Attempt 5: 27147.75586128235

With optimization (times in ms): Avg 9149.3
Attempt 1: 9941.372871398926
Attempt 2: 10097.685098648071
Attempt 3: 7855.895042419434
Attempt 4: 9078.719854354858
Attempt 5: 8772.69196510315
This commit is contained in:
Eric Gustin 2025-04-24 10:03:23 -08:00 committed by GitHub
parent e46b985cc9
commit 8f87212058
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 47 additions and 28 deletions

View file

@ -1,3 +1,4 @@
import asyncio
from typing import Annotated, Any
import httpx
@ -29,47 +30,65 @@ async def get_page_content_by_id(
async with httpx.AsyncClient() as client:
async def fetch_markdown_recursive(block_id: str, indent: str = "") -> str:
"""
Gets the markdown content of a Notion page.
Performs DFS while paginating through the page's block children, converting
each block to markdown and conserving the page's indentation level.
"""
markdown_pieces = []
async def fetch_blocks(block_id: str) -> list:
"""Fetch all immediate children blocks for a given block ID, handling pagination"""
all_blocks = []
url = get_url("retrieve_block_children", block_id=block_id)
cursor = None
while True:
data, has_more, cursor = await get_next_page(client, url, headers, params, cursor)
for block in data.get("results", []):
block_markdown = await converter.convert_block(block)
if block_markdown:
# Append each line with indent as a separate piece
for line in block_markdown.rstrip("\n").splitlines():
markdown_pieces.append(indent + line + "\n")
# If the block has children and is not a child page, recurse.
# We don't recurse into child page content, as this would result in fetching
# the children pages' content, which the Notion UI does not show.
if (
block.get("has_children", False)
and block.get("type") != BlockType.CHILD_PAGE.value
):
markdown_pieces.append(
await fetch_markdown_recursive(block["id"], indent + " ")
)
all_blocks.extend(data.get("results", []))
if not has_more:
break
return all_blocks
async def process_blocks_to_markdown(blocks: list, indent: str = "") -> str:
"""Convert a list of blocks into a single markdown string.

Each block is passed to ``converter.convert_block``; every line of a
non-empty result is prefixed with ``indent`` and terminated with a newline.
For a block that reports children and is not a child page, the children are
fetched with ``fetch_blocks`` and converted by a recursive call with a longer
indent; that output is placed after the block's own lines.

Args:
blocks: Blocks to convert, handled one at a time in list order.
indent: Prefix prepended to every emitted markdown line.

Returns:
The concatenation of all emitted pieces, in order.
"""
# NOTE(review): this view has lost its indentation, so the nesting of the
# statements below is not visible here. The comments suggest the recursion
# check applies to every block regardless of whether it produced markdown —
# confirm against the real file.
markdown_pieces = []
for block in blocks:
block_markdown = await converter.convert_block(block)
if block_markdown:
# Append each line with indent as a separate piece.
# Trailing newlines are stripped first so they do not yield extra
# empty lines; each kept line gets exactly one "\n" back.
for line in block_markdown.rstrip("\n").splitlines():
markdown_pieces.append(indent + line + "\n")
# If the block has children and is not a child page, recurse.
# We don't recurse into child page content, as this would result in fetching
# the children pages' content, which the Notion UI does not show.
if (
block.get("has_children", False)
and block.get("type") != BlockType.CHILD_PAGE.value
):
# Fetch all child blocks first (fetch_blocks handles pagination)
child_blocks = await fetch_blocks(block["id"])
# Then process them all at once, one indent level deeper
child_markdown = await process_blocks_to_markdown(child_blocks, indent + " ")
markdown_pieces.append(child_markdown)
return "".join(markdown_pieces)
# Get the title
page_metadata = await get_object_metadata(context, object_id=page_id)
markdown_title = f"# {extract_title(page_metadata)}\n"
# Get the content
markdown_content = await fetch_markdown_recursive(page_id, "")
# Get all top-level blocks
top_level_blocks = await fetch_blocks(page_id)
chunk_size = max(1, len(top_level_blocks) // 5)
chunks = [
top_level_blocks[i : i + chunk_size]
for i in range(0, len(top_level_blocks), chunk_size)
]
# Process all block content into markdown
results = await asyncio.gather(*[process_blocks_to_markdown(chunk, "") for chunk in chunks])
markdown_content = "".join(results)
return markdown_title + markdown_content

View file

@ -1,6 +1,6 @@
[tool.poetry]
name = "arcade_notion_toolkit"
version = "0.1.2"
version = "0.1.3"
description = "Arcade.dev LLM tools for Notion"
authors = ["ArcadeAI <dev@arcade.dev>"]