From 8f8721205834d5cc105fb445bff8fffb7db92184 Mon Sep 17 00:00:00 2001
From: Eric Gustin <34000337+EricGustin@users.noreply.github.com>
Date: Thu, 24 Apr 2025 10:03:23 -0800
Subject: [PATCH] Notion Toolkit Optimizations (#379)

1. Paginate through all blocks at the current level before recursing into
their children (before this PR, pagination and recursion were
interleaved)
2. For top-level blocks only, split the blocks into chunks of size
max(1, len // 5) (about 5 chunks on larger pages) and process the
chunks concurrently


---------

In my local timing benchmarks, this reduces the tool call's latency by
~60% (from ~23 seconds to ~9.1 seconds) for larger Notion pages


Without optimization (times in ms): Avg 22995
Attempt 1: 27503.49497795105
Attempt 2: 20863.977909088135
Attempt 3: 20888.309955596924
Attempt 4: 18574.61714744568
Attempt 5: 27147.75586128235

With optimization (times in ms): Avg 9149.3
Attempt 1: 9941.372871398926
Attempt 2: 10097.685098648071
Attempt 3: 7855.895042419434
Attempt 4: 9078.719854354858
Attempt 5: 8772.69196510315
---
 .../arcade_notion_toolkit/tools/pages.py      | 73 ++++++++++++-------
 toolkits/notion/pyproject.toml                |  2 +-
 2 files changed, 47 insertions(+), 28 deletions(-)

diff --git a/toolkits/notion/arcade_notion_toolkit/tools/pages.py b/toolkits/notion/arcade_notion_toolkit/tools/pages.py
index d98adb87..1e1a54cc 100644
--- a/toolkits/notion/arcade_notion_toolkit/tools/pages.py
+++ b/toolkits/notion/arcade_notion_toolkit/tools/pages.py
@@ -1,3 +1,4 @@
+import asyncio
 from typing import Annotated, Any
 
 import httpx
@@ -29,47 +30,65 @@ async def get_page_content_by_id(
 
     async with httpx.AsyncClient() as client:
 
-        async def fetch_markdown_recursive(block_id: str, indent: str = "") -> str:
-            """
-            Gets the markdown content of a Notion page.
-
-            Performs DFS while paginating through the page's block children, converting
-            each block to markdown and conserving the page's indentation level.
-            """
-            markdown_pieces = []
+        async def fetch_blocks(block_id: str) -> list:
+            """Fetch all immediate children blocks for a given block ID, handling pagination"""
+            all_blocks = []
             url = get_url("retrieve_block_children", block_id=block_id)
             cursor = None
 
             while True:
                 data, has_more, cursor = await get_next_page(client, url, headers, params, cursor)
-                for block in data.get("results", []):
-                    block_markdown = await converter.convert_block(block)
-                    if block_markdown:
-                        # Append each line with indent as a separate piece
-                        for line in block_markdown.rstrip("\n").splitlines():
-                            markdown_pieces.append(indent + line + "\n")
-
-                    # If the block has children and is not a child page, recurse.
-                    # We don't recurse into child page content, as this would result in fetching
-                    # the children pages' content, which the Notion UI does not show.
-                    if (
-                        block.get("has_children", False)
-                        and block.get("type") != BlockType.CHILD_PAGE.value
-                    ):
-                        markdown_pieces.append(
-                            await fetch_markdown_recursive(block["id"], indent + "    ")
-                        )
+                all_blocks.extend(data.get("results", []))
                 if not has_more:
                     break
 
+            return all_blocks
+
+        async def process_blocks_to_markdown(blocks: list, indent: str = "") -> str:
+            """Process a list of blocks into markdown.
+
+            If a block has children, we recurse into the children blocks.
+            """
+            markdown_pieces = []
+
+            for block in blocks:
+                block_markdown = await converter.convert_block(block)
+                if block_markdown:
+                    # Append each line with indent as a separate piece
+                    for line in block_markdown.rstrip("\n").splitlines():
+                        markdown_pieces.append(indent + line + "\n")
+
+                # If the block has children and is not a child page, recurse.
+                # We don't recurse into child page content, as this would result in fetching
+                # the children pages' content, which the Notion UI does not show.
+                if (
+                    block.get("has_children", False)
+                    and block.get("type") != BlockType.CHILD_PAGE.value
+                ):
+                    # Fetch all child blocks first
+                    child_blocks = await fetch_blocks(block["id"])
+                    # Then process them all at once
+                    child_markdown = await process_blocks_to_markdown(child_blocks, indent + "    ")
+                    markdown_pieces.append(child_markdown)
+
             return "".join(markdown_pieces)
 
         # Get the title
         page_metadata = await get_object_metadata(context, object_id=page_id)
         markdown_title = f"# {extract_title(page_metadata)}\n"
 
-        # Get the content
-        markdown_content = await fetch_markdown_recursive(page_id, "")
+        # Get all top-level blocks
+        top_level_blocks = await fetch_blocks(page_id)
+
+        chunk_size = max(1, len(top_level_blocks) // 5)
+        chunks = [
+            top_level_blocks[i : i + chunk_size]
+            for i in range(0, len(top_level_blocks), chunk_size)
+        ]
+
+        # Process all block content into markdown
+        results = await asyncio.gather(*[process_blocks_to_markdown(chunk, "") for chunk in chunks])
+        markdown_content = "".join(results)
 
         return markdown_title + markdown_content
 
diff --git a/toolkits/notion/pyproject.toml b/toolkits/notion/pyproject.toml
index 87d98c72..04643ae8 100644
--- a/toolkits/notion/pyproject.toml
+++ b/toolkits/notion/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "arcade_notion_toolkit"
-version = "0.1.2"
+version = "0.1.3"
 description = "Arcade.dev LLM tools for Notion"
 authors = ["ArcadeAI <dev@arcade.dev>"]