From d0bbe4a921d9b50b67c32aa5d2c92d39a2b61afb Mon Sep 17 00:00:00 2001 From: orihatav Date: Sat, 21 Feb 2026 22:54:15 +0200 Subject: [PATCH] fix: handle tiktoken network errors in offline environments (issue #264) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In air-gapped / offline Docker deployments, tiktoken.get_encoding() tries to download the encoding file from openaipublic.blob.core.windows.net. When that request fails it raises a URLError / OSError — not an ImportError — so the previous except clause silently missed it and the crash surfaced in the UI. Widened `except ImportError` to `except Exception` so all failures — "not installed" and "network unreachable" — fall through to the word-count fallback (words × 1.3). Added a loguru WARNING so operators can see when the fallback is active. TIKTOKEN_CACHE_DIR now reads from the environment with a blank-safe fallback (`or` guard prevents os.makedirs("") on empty env var). This lets Docker images redirect the cache to a path outside /app/data/ so user-data volume mounts cannot shadow the pre-baked encoding. Both images now pre-download the o200k_base encoding during the builder stage (internet is available at build time) and copy it into the runtime image at /app/tiktoken-cache. ENV TIKTOKEN_CACHE_DIR=/app/tiktoken-cache is set in the runtime stage so no network call is ever needed at runtime. Added test_token_count_network_error_fallback in tests/test_utils.py: patches tiktoken.get_encoding with a URLError and asserts token_count() returns a positive int instead of raising. 
Fixes #264 Co-Authored-By: Claude Sonnet 4.6 --- Dockerfile | 13 +++++++++++++ Dockerfile.single | 13 +++++++++++++ open_notebook/config.py | 5 ++++- open_notebook/utils/token_utils.py | 11 +++++++++-- tests/test_utils.py | 22 ++++++++++++++++++++++ 5 files changed, 61 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 9e30e09..848178d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -32,6 +32,14 @@ COPY open_notebook/__init__.py ./open_notebook/__init__.py # Install dependencies with optimizations (this layer will be cached unless dependencies change) RUN uv sync --frozen --no-dev +# Pre-download tiktoken encoding so the app works offline (issue #264). +# /app/tiktoken-cache is intentionally outside /app/data/ so that volume mounts +# of /app/data (for user data persistence) do not hide the pre-baked encoding. +# config.py reads TIKTOKEN_CACHE_DIR from the environment to pick up this path. +ENV TIKTOKEN_CACHE_DIR=/app/tiktoken-cache +RUN mkdir -p /app/tiktoken-cache && \ + .venv/bin/python -c "import tiktoken; tiktoken.get_encoding('o200k_base')" + # Copy the rest of the application code COPY . /app @@ -72,9 +80,14 @@ COPY --from=builder /app/.venv /app/.venv # Copy the source code (the rest) COPY . 
/app +# Copy pre-downloaded tiktoken encoding from builder (outside /app/data/ — volume-mount safe) +COPY --from=builder /app/tiktoken-cache /app/tiktoken-cache + # Ensure uv uses the existing venv without attempting network operations ENV UV_NO_SYNC=1 ENV VIRTUAL_ENV=/app/.venv +# Point the app at the pre-baked tiktoken encoding (see open_notebook/config.py) +ENV TIKTOKEN_CACHE_DIR=/app/tiktoken-cache # Bind Next.js to all interfaces (required for Docker networking and reverse proxies) ENV HOSTNAME=0.0.0.0 diff --git a/Dockerfile.single b/Dockerfile.single index 32ddd28..1eb5e80 100644 --- a/Dockerfile.single +++ b/Dockerfile.single @@ -33,6 +33,14 @@ COPY open_notebook/__init__.py ./open_notebook/__init__.py # Install dependencies RUN uv sync --frozen --no-dev +# Pre-download tiktoken encoding so the app works offline (issue #264). +# /app/tiktoken-cache is intentionally outside /app/data/ so that volume mounts +# of /app/data (for user data persistence) do not hide the pre-baked encoding. +# config.py reads TIKTOKEN_CACHE_DIR from the environment to pick up this path. +ENV TIKTOKEN_CACHE_DIR=/app/tiktoken-cache +RUN mkdir -p /app/tiktoken-cache && \ + .venv/bin/python -c "import tiktoken; tiktoken.get_encoding('o200k_base')" + # Stage 5: Runtime FROM python:3.12-slim-bookworm AS runtime @@ -57,6 +65,9 @@ WORKDIR /app COPY --from=backend-builder /app/.venv /app/.venv COPY . 
/app/ +# Copy pre-downloaded tiktoken encoding from builder (outside /app/data/ — volume-mount safe) +COPY --from=backend-builder /app/tiktoken-cache /app/tiktoken-cache + # Copy built frontend from standalone output COPY --from=frontend-builder /app/frontend/.next/standalone /app/frontend/ COPY --from=frontend-builder /app/frontend/.next/static /app/frontend/.next/static @@ -64,6 +75,8 @@ COPY --from=frontend-builder /app/frontend/public /app/frontend/public # Bind Next.js to all interfaces (required for Docker networking and reverse proxies) ENV HOSTNAME=0.0.0.0 +# Point the app at the pre-baked tiktoken encoding (see open_notebook/config.py) +ENV TIKTOKEN_CACHE_DIR=/app/tiktoken-cache # Setup directories and permissions RUN mkdir -p /app/data /mydata diff --git a/open_notebook/config.py b/open_notebook/config.py index 00d64cb..2db8f74 100644 --- a/open_notebook/config.py +++ b/open_notebook/config.py @@ -13,5 +13,8 @@ UPLOADS_FOLDER = f"{DATA_FOLDER}/uploads" os.makedirs(UPLOADS_FOLDER, exist_ok=True) # TIKTOKEN CACHE FOLDER -TIKTOKEN_CACHE_DIR = f"{DATA_FOLDER}/tiktoken-cache" +# Reads TIKTOKEN_CACHE_DIR from the environment so Docker can redirect the cache +# to a path outside /app/data/ (which is typically volume-mounted and would hide the +# encoding baked into the image at build time). 
+TIKTOKEN_CACHE_DIR = os.environ.get("TIKTOKEN_CACHE_DIR", "").strip() or f"{DATA_FOLDER}/tiktoken-cache" os.makedirs(TIKTOKEN_CACHE_DIR, exist_ok=True) diff --git a/open_notebook/utils/token_utils.py b/open_notebook/utils/token_utils.py index 9b4a9ea..fd77592 100644 --- a/open_notebook/utils/token_utils.py +++ b/open_notebook/utils/token_utils.py @@ -28,8 +28,15 @@ def token_count(input_string: str) -> int: encoding = tiktoken.get_encoding("o200k_base") tokens = encoding.encode(input_string) return len(tokens) - except ImportError: - # Fallback: simple word count estimation + except Exception: + # Fallback: handles ImportError (not installed) AND network errors + # (e.g., offline environments that can't download encoding from internet) + from loguru import logger + + logger.warning( + "tiktoken unavailable (not installed or offline); " + "falling back to word-count estimation." + ) return int(len(input_string.split()) * 1.3) diff --git a/tests/test_utils.py b/tests/test_utils.py index e686826..389e2b9 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -151,6 +151,28 @@ class TestTokenUtilities: assert isinstance(count, int) assert count > 0 + def test_token_count_network_error_fallback(self): + """Test fallback when tiktoken raises a network error (issue #264). + + In offline environments tiktoken.get_encoding() tries to download the + encoding file and raises a URLError/OSError, not an ImportError. + The except clause must catch Exception (not only ImportError) so that + these network failures also fall through to the word-count estimate. 
+ """ + import urllib.error + from unittest.mock import patch + + with patch( + "tiktoken.get_encoding", + side_effect=urllib.error.URLError("No network (simulated offline)"), + ): + text = "one two three four five" + count = token_count(text) + + # Must not raise; must return a positive int via the fallback + assert isinstance(count, int) + assert count > 0 + # ============================================================================ # TEST SUITE 3: Version Utilities