diff --git a/Dockerfile b/Dockerfile index 9e30e09..848178d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -32,6 +32,14 @@ COPY open_notebook/__init__.py ./open_notebook/__init__.py # Install dependencies with optimizations (this layer will be cached unless dependencies change) RUN uv sync --frozen --no-dev +# Pre-download tiktoken encoding so the app works offline (issue #264). +# /app/tiktoken-cache is intentionally outside /app/data/ so that volume mounts +# of /app/data (for user data persistence) do not hide the pre-baked encoding. +# config.py reads TIKTOKEN_CACHE_DIR from the environment to pick up this path. +ENV TIKTOKEN_CACHE_DIR=/app/tiktoken-cache +RUN mkdir -p /app/tiktoken-cache && \ + .venv/bin/python -c "import tiktoken; tiktoken.get_encoding('o200k_base')" + # Copy the rest of the application code COPY . /app @@ -72,9 +80,14 @@ COPY --from=builder /app/.venv /app/.venv # Copy the source code (the rest) COPY . /app +# Copy pre-downloaded tiktoken encoding from builder (outside /data/ — volume-mount safe) +COPY --from=builder /app/tiktoken-cache /app/tiktoken-cache + # Ensure uv uses the existing venv without attempting network operations ENV UV_NO_SYNC=1 ENV VIRTUAL_ENV=/app/.venv +# Point the app at the pre-baked tiktoken encoding (see open_notebook/config.py) +ENV TIKTOKEN_CACHE_DIR=/app/tiktoken-cache # Bind Next.js to all interfaces (required for Docker networking and reverse proxies) ENV HOSTNAME=0.0.0.0 diff --git a/Dockerfile.single b/Dockerfile.single index 32ddd28..1eb5e80 100644 --- a/Dockerfile.single +++ b/Dockerfile.single @@ -33,6 +33,14 @@ COPY open_notebook/__init__.py ./open_notebook/__init__.py # Install dependencies RUN uv sync --frozen --no-dev +# Pre-download tiktoken encoding so the app works offline (issue #264). +# /app/tiktoken-cache is intentionally outside /app/data/ so that volume mounts +# of /app/data (for user data persistence) do not hide the pre-baked encoding. 
+# config.py reads TIKTOKEN_CACHE_DIR from the environment to pick up this path. +ENV TIKTOKEN_CACHE_DIR=/app/tiktoken-cache +RUN mkdir -p /app/tiktoken-cache && \ + .venv/bin/python -c "import tiktoken; tiktoken.get_encoding('o200k_base')" + # Stage 5: Runtime FROM python:3.12-slim-bookworm AS runtime @@ -57,6 +65,9 @@ WORKDIR /app COPY --from=backend-builder /app/.venv /app/.venv COPY . /app/ +# Copy pre-downloaded tiktoken encoding from builder (outside /data/ — volume-mount safe) +COPY --from=backend-builder /app/tiktoken-cache /app/tiktoken-cache + # Copy built frontend from standalone output COPY --from=frontend-builder /app/frontend/.next/standalone /app/frontend/ COPY --from=frontend-builder /app/frontend/.next/static /app/frontend/.next/static @@ -64,6 +75,8 @@ COPY --from=frontend-builder /app/frontend/public /app/frontend/public # Bind Next.js to all interfaces (required for Docker networking and reverse proxies) ENV HOSTNAME=0.0.0.0 +# Point the app at the pre-baked tiktoken encoding (see open_notebook/config.py) +ENV TIKTOKEN_CACHE_DIR=/app/tiktoken-cache # Setup directories and permissions RUN mkdir -p /app/data /mydata diff --git a/open_notebook/config.py b/open_notebook/config.py index 00d64cb..2db8f74 100644 --- a/open_notebook/config.py +++ b/open_notebook/config.py @@ -13,5 +13,8 @@ UPLOADS_FOLDER = f"{DATA_FOLDER}/uploads" os.makedirs(UPLOADS_FOLDER, exist_ok=True) # TIKTOKEN CACHE FOLDER -TIKTOKEN_CACHE_DIR = f"{DATA_FOLDER}/tiktoken-cache" +# Reads TIKTOKEN_CACHE_DIR from the environment so Docker can redirect the cache +# to a path outside /data/ (which is typically volume-mounted and would hide the +# encoding pre-baked into the image at build time). 
+TIKTOKEN_CACHE_DIR = os.environ.get("TIKTOKEN_CACHE_DIR", "").strip() or f"{DATA_FOLDER}/tiktoken-cache" os.makedirs(TIKTOKEN_CACHE_DIR, exist_ok=True) diff --git a/open_notebook/utils/token_utils.py b/open_notebook/utils/token_utils.py index 9b4a9ea..fd77592 100644 --- a/open_notebook/utils/token_utils.py +++ b/open_notebook/utils/token_utils.py @@ -28,8 +28,15 @@ def token_count(input_string: str) -> int: encoding = tiktoken.get_encoding("o200k_base") tokens = encoding.encode(input_string) return len(tokens) - except ImportError: - # Fallback: simple word count estimation + except Exception: + # Fallback: handles ImportError (not installed) AND network errors + # (e.g., offline environments that can't download encoding from internet) + from loguru import logger + + logger.warning( + "tiktoken unavailable (not installed or offline); " + "falling back to word-count estimation." + ) return int(len(input_string.split()) * 1.3) diff --git a/tests/test_utils.py b/tests/test_utils.py index e686826..389e2b9 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -151,6 +151,28 @@ class TestTokenUtilities: assert isinstance(count, int) assert count > 0 + def test_token_count_network_error_fallback(self): + """Test fallback when tiktoken raises a network error (issue #264). + + In offline environments tiktoken.get_encoding() tries to download the + encoding file and raises a URLError/OSError, not an ImportError. + The except clause must catch Exception (not only ImportError) so that + these network failures also fall through to the word-count estimate. 
+ """ + import urllib.error + from unittest.mock import patch + + with patch( + "tiktoken.get_encoding", + side_effect=urllib.error.URLError("No network (simulated offline)"), + ): + text = "one two three four five" + count = token_count(text) + + # Must not raise; must return a positive int via the fallback + assert isinstance(count, int) + assert count > 0 + # ============================================================================ # TEST SUITE 3: Version Utilities