From 4f33b854dd2e2fb2d0c01d73e34a941f3614dd27 Mon Sep 17 00:00:00 2001 From: Luis Novo Date: Sat, 31 Jan 2026 19:30:56 -0300 Subject: [PATCH] feat: add environment variables for chunk size configuration (#520) Adds OPEN_NOTEBOOK_CHUNK_SIZE and OPEN_NOTEBOOK_CHUNK_OVERLAP environment variables to allow users to configure chunking behavior for different embedding models with varying context window limits. Key changes: - CHUNK_SIZE is now configurable via OPEN_NOTEBOOK_CHUNK_SIZE (default: 1200) - CHUNK_OVERLAP is configurable via OPEN_NOTEBOOK_CHUNK_OVERLAP (default: 15%) - Validation with warnings for invalid or out-of-range values - Updated documentation with configuration examples This enables users of models like mxbai-embed-large with limited context windows to reduce chunk size accordingly. Closes #510 --- open_notebook/utils/CLAUDE.md | 30 ++++++++++++-- open_notebook/utils/chunking.py | 71 +++++++++++++++++++++++++++++++-- 2 files changed, 95 insertions(+), 6 deletions(-) diff --git a/open_notebook/utils/CLAUDE.md b/open_notebook/utils/CLAUDE.md index e811b25..d6a6de0 100644 --- a/open_notebook/utils/CLAUDE.md +++ b/open_notebook/utils/CLAUDE.md @@ -18,6 +18,30 @@ Provides cross-cutting concerns: building LLM context from sources/insights, con Each utility is stateless and can be imported independently. +## Configuration + +### Chunking Configuration (chunking.py) + +The chunking behavior can be configured via environment variables: + +- **OPEN_NOTEBOOK_CHUNK_SIZE**: Maximum chunk size in characters (default: 1200) + - Minimum: 100 characters + - Warnings: Values > 8192 characters or invalid values + - Use case: Smaller models (e.g., mxbai-embed-large with limited context window) + +- **OPEN_NOTEBOOK_CHUNK_OVERLAP**: Overlap between chunks in characters (default: 15% of CHUNK_SIZE) + - Must be: >= 0 and < CHUNK_SIZE + - Warnings: Invalid values or values >= CHUNK_SIZE + - Use case: Control how much context is shared between adjacent chunks + +Example for models with small context windows: +```bash +export OPEN_NOTEBOOK_CHUNK_SIZE=512 +export OPEN_NOTEBOOK_CHUNK_OVERLAP=50 +``` + +Note: Changes require restart of the application. + ## Component Catalog ### context_builder.py @@ -39,8 +63,8 @@ Each utility is stateless and can be imported independently. ### chunking.py - **ContentType**: Enum (HTML, MARKDOWN, PLAIN) -- **CHUNK_SIZE**: constant -- **CHUNK_OVERLAP**: constant +- **CHUNK_SIZE**: Configurable via `OPEN_NOTEBOOK_CHUNK_SIZE` env var (default: 1200) +- **CHUNK_OVERLAP**: Configurable via `OPEN_NOTEBOOK_CHUNK_OVERLAP` env var (default: 15% of CHUNK_SIZE) - **detect_content_type_from_extension(file_path)**: Detect type from file extension - **detect_content_type_from_heuristics(text)**: Detect type from content patterns (returns type + confidence) - **detect_content_type(text, file_path)**: Combined detection (extension primary, heuristics fallback) @@ -125,7 +149,7 @@ Each utility is stateless and can be imported independently. 1. **Add new context source type**: Create fetch method in ContextBuilder; update ContextConfig.sources dict 2. **Add content type**: Add to ContentType enum; create splitter getter; update chunk_text() -3. **Change chunk size**: Modify CHUNK_SIZE and CHUNK_OVERLAP constants in chunking.py +3. **Change chunk size**: Set OPEN_NOTEBOOK_CHUNK_SIZE and OPEN_NOTEBOOK_CHUNK_OVERLAP environment variables 4. **Add text preprocessing**: Add new function to text_utils (e.g., remove_urls, extract_keywords) 5. **Change tokenization**: Replace tiktoken with alternative library in token_utils; update all calls 6. **Add context filtering**: Extend ContextConfig with filter_by_date, filter_by_topic fields diff --git a/open_notebook/utils/chunking.py b/open_notebook/utils/chunking.py index 10ea0f0..3f33ba4 100644 --- a/open_notebook/utils/chunking.py +++ b/open_notebook/utils/chunking.py @@ -7,8 +7,13 @@ Supports HTML, Markdown, and plain text with appropriate splitters for each type Key functions: - detect_content_type(): Detects content type from file extension or content heuristics - chunk_text(): Splits text into chunks using appropriate splitter for content type + +Environment Variables: + OPEN_NOTEBOOK_CHUNK_SIZE: Maximum chunk size in characters (default: 1200) + OPEN_NOTEBOOK_CHUNK_OVERLAP: Overlap between chunks in characters (default: 15% of CHUNK_SIZE) """ +import os import re from enum import Enum from pathlib import Path @@ -21,11 +26,71 @@ from langchain_text_splitters import ( ) from loguru import logger -# Constants -CHUNK_SIZE = 1200 # characters -CHUNK_OVERLAP = 180 # 15% of chunk size + +def _get_chunk_size() -> int: + """Get chunk size from environment variable or use default.""" + chunk_size_str = os.getenv("OPEN_NOTEBOOK_CHUNK_SIZE") + if chunk_size_str: + try: + chunk_size = int(chunk_size_str) + if chunk_size < 100: + logger.warning( + f"OPEN_NOTEBOOK_CHUNK_SIZE ({chunk_size}) is too small. " + f"Using minimum value of 100." + ) + return 100 + if chunk_size > 8192: + logger.warning( + f"OPEN_NOTEBOOK_CHUNK_SIZE ({chunk_size}) is very large. " + f"This may cause issues with some embedding models." + ) + logger.info(f"Using custom chunk size: {chunk_size} characters") + return chunk_size + except ValueError: + logger.warning( + f"Invalid OPEN_NOTEBOOK_CHUNK_SIZE value: '{chunk_size_str}'. " + f"Using default: 1200" + ) + return 1200 + + +def _get_chunk_overlap(chunk_size: int) -> int: + """Get chunk overlap from environment variable or calculate default (15% of chunk size).""" + overlap_str = os.getenv("OPEN_NOTEBOOK_CHUNK_OVERLAP") + if overlap_str: + try: + overlap = int(overlap_str) + if overlap < 0: + logger.warning( + f"OPEN_NOTEBOOK_CHUNK_OVERLAP ({overlap}) cannot be negative. " + f"Using 0." + ) + return 0 + if overlap >= chunk_size: + logger.warning( + f"OPEN_NOTEBOOK_CHUNK_OVERLAP ({overlap}) cannot be >= chunk size ({chunk_size}). " + f"Using 15% of chunk size: {int(chunk_size * 0.15)}" + ) + return int(chunk_size * 0.15) + logger.info(f"Using custom chunk overlap: {overlap} characters") + return overlap + except ValueError: + logger.warning( + f"Invalid OPEN_NOTEBOOK_CHUNK_OVERLAP value: '{overlap_str}'. " + f"Using default: 15% of chunk size" + ) + return int(chunk_size * 0.15) + + +# Constants (computed at import time from environment variables) +CHUNK_SIZE = _get_chunk_size() +CHUNK_OVERLAP = _get_chunk_overlap(CHUNK_SIZE) HIGH_CONFIDENCE_THRESHOLD = 0.8 # Threshold for heuristics to override extension +logger.debug( + f"Chunking configuration: CHUNK_SIZE={CHUNK_SIZE}, CHUNK_OVERLAP={CHUNK_OVERLAP}" +) + class ContentType(Enum): """Content type for chunking strategy selection."""