From 8f8721205834d5cc105fb445bff8fffb7db92184 Mon Sep 17 00:00:00 2001 From: Eric Gustin <34000337+EricGustin@users.noreply.github.com> Date: Thu, 24 Apr 2025 10:03:23 -0800 Subject: [PATCH] Notion Toolkit Optimizations (#379) 1. Paginate through all current level blocks before recursing into children (before this PR we would alternate between paginating and recursing) 2. For top-level blocks only, split the blocks into roughly 5 chunks (chunk size is len(blocks) // 5, so a remainder can yield a few extra chunks) and fetch their content concurrently --------- From my local timing benchmarks, this speeds up the tool call by ~60% (23 seconds to 9.1 seconds) for larger Notion pages Without optimization (ms): Avg 22995 Attempt 1: 27503.49497795105 Attempt 2: 20863.977909088135 Attempt 3: 20888.309955596924 Attempt 4: 18574.61714744568 Attempt 5: 27147.75586128235 With optimization (ms): Avg 9148.6 Attempt 1: 9941.372871398926 Attempt 2: 10097.685098648071 Attempt 3: 7855.895042419434 Attempt 4: 9078.719854354858 Attempt 5: 8772.69196510315 --- .../arcade_notion_toolkit/tools/pages.py | 73 ++++++++++++------- toolkits/notion/pyproject.toml | 2 +- 2 files changed, 47 insertions(+), 28 deletions(-) diff --git a/toolkits/notion/arcade_notion_toolkit/tools/pages.py b/toolkits/notion/arcade_notion_toolkit/tools/pages.py index d98adb87..1e1a54cc 100644 --- a/toolkits/notion/arcade_notion_toolkit/tools/pages.py +++ b/toolkits/notion/arcade_notion_toolkit/tools/pages.py @@ -1,3 +1,4 @@ +import asyncio from typing import Annotated, Any import httpx @@ -29,47 +30,65 @@ async def get_page_content_by_id( async with httpx.AsyncClient() as client: - async def fetch_markdown_recursive(block_id: str, indent: str = "") -> str: - """ - Gets the markdown content of a Notion page. - - Performs DFS while paginating through the page's block children, converting - each block to markdown and conserving the page's indentation level. 
- """ - markdown_pieces = [] + async def fetch_blocks(block_id: str) -> list: + """Fetch all immediate children blocks for a given block ID, handling pagination""" + all_blocks = [] url = get_url("retrieve_block_children", block_id=block_id) cursor = None while True: data, has_more, cursor = await get_next_page(client, url, headers, params, cursor) - for block in data.get("results", []): - block_markdown = await converter.convert_block(block) - if block_markdown: - # Append each line with indent as a separate piece - for line in block_markdown.rstrip("\n").splitlines(): - markdown_pieces.append(indent + line + "\n") - - # If the block has children and is not a child page, recurse. - # We don't recurse into child page content, as this would result in fetching - # the children pages' content, which the Notion UI does not show. - if ( - block.get("has_children", False) - and block.get("type") != BlockType.CHILD_PAGE.value - ): - markdown_pieces.append( - await fetch_markdown_recursive(block["id"], indent + " ") - ) + all_blocks.extend(data.get("results", [])) if not has_more: break + return all_blocks + + async def process_blocks_to_markdown(blocks: list, indent: str = "") -> str: + """Process a list of blocks into markdown. + + If a block has children, we recurse into the children blocks. + """ + markdown_pieces = [] + + for block in blocks: + block_markdown = await converter.convert_block(block) + if block_markdown: + # Append each line with indent as a separate piece + for line in block_markdown.rstrip("\n").splitlines(): + markdown_pieces.append(indent + line + "\n") + + # If the block has children and is not a child page, recurse. + # We don't recurse into child page content, as this would result in fetching + # the children pages' content, which the Notion UI does not show. 
+ if ( + block.get("has_children", False) + and block.get("type") != BlockType.CHILD_PAGE.value + ): + # Fetch all child blocks first + child_blocks = await fetch_blocks(block["id"]) + # Then process them all at once + child_markdown = await process_blocks_to_markdown(child_blocks, indent + " ") + markdown_pieces.append(child_markdown) + return "".join(markdown_pieces) # Get the title page_metadata = await get_object_metadata(context, object_id=page_id) markdown_title = f"# {extract_title(page_metadata)}\n" - # Get the content - markdown_content = await fetch_markdown_recursive(page_id, "") + # Get all top-level blocks + top_level_blocks = await fetch_blocks(page_id) + + chunk_size = max(1, len(top_level_blocks) // 5) + chunks = [ + top_level_blocks[i : i + chunk_size] + for i in range(0, len(top_level_blocks), chunk_size) + ] + + # Process all block content into markdown + results = await asyncio.gather(*[process_blocks_to_markdown(chunk, "") for chunk in chunks]) + markdown_content = "".join(results) return markdown_title + markdown_content diff --git a/toolkits/notion/pyproject.toml b/toolkits/notion/pyproject.toml index 87d98c72..04643ae8 100644 --- a/toolkits/notion/pyproject.toml +++ b/toolkits/notion/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "arcade_notion_toolkit" -version = "0.1.2" +version = "0.1.3" description = "Arcade.dev LLM tools for Notion" authors = ["ArcadeAI "]