fix: handle tiktoken network errors in offline environments (issue #264)
In air-gapped / offline Docker deployments, tiktoken.get_encoding() tries
to download the encoding file from openaipublic.blob.core.windows.net.
When that request fails it raises a URLError / OSError — not an ImportError
— so the previous except clause silently missed it and the crash surfaced in
the UI.
Widened `except ImportError` to `except Exception` so all failures —
"not installed" and "network unreachable" — fall through to the word-count
fallback (words × 1.3). Added a loguru WARNING so operators can see when
the fallback is active.
TIKTOKEN_CACHE_DIR is now read from the environment with a blank-safe
fallback (the `or` guard prevents os.makedirs("") when the env var is empty).
This lets Docker images redirect the cache to a path outside /app/data/ so
user-data volume mounts cannot shadow the pre-baked encoding.
Both images now pre-download the o200k_base encoding during the builder
stage (internet is available at build time) and copy it into the runtime
image at /app/tiktoken-cache. ENV TIKTOKEN_CACHE_DIR=/app/tiktoken-cache
is set in the runtime stage so no network call is ever needed at runtime.
Added test_token_count_network_error_fallback in tests/test_utils.py: it
patches tiktoken.get_encoding to raise a URLError and asserts that
token_count() returns a positive int instead of raising.
Fixes #264
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
38d9ed5986
commit
d0bbe4a921
5 changed files with 61 additions and 3 deletions
13
Dockerfile
13
Dockerfile
|
|
@ -32,6 +32,14 @@ COPY open_notebook/__init__.py ./open_notebook/__init__.py
|
|||
# Install dependencies with optimizations (this layer will be cached unless dependencies change)
|
||||
RUN uv sync --frozen --no-dev
|
||||
|
||||
# Pre-download tiktoken encoding so the app works offline (issue #264).
|
||||
# /app/tiktoken-cache is intentionally outside /app/data/ so that volume mounts
|
||||
# of /app/data (for user data persistence) do not hide the pre-baked encoding.
|
||||
# config.py reads TIKTOKEN_CACHE_DIR from the environment to pick up this path.
|
||||
ENV TIKTOKEN_CACHE_DIR=/app/tiktoken-cache
|
||||
RUN mkdir -p /app/tiktoken-cache && \
|
||||
.venv/bin/python -c "import tiktoken; tiktoken.get_encoding('o200k_base')"
|
||||
|
||||
# Copy the rest of the application code
|
||||
COPY . /app
|
||||
|
||||
|
|
@ -72,9 +80,14 @@ COPY --from=builder /app/.venv /app/.venv
|
|||
# Copy the source code (the rest)
|
||||
COPY . /app
|
||||
|
||||
# Copy pre-downloaded tiktoken encoding from builder (outside /app/data/ — volume-mount safe)
|
||||
COPY --from=builder /app/tiktoken-cache /app/tiktoken-cache
|
||||
|
||||
# Ensure uv uses the existing venv without attempting network operations
|
||||
ENV UV_NO_SYNC=1
|
||||
ENV VIRTUAL_ENV=/app/.venv
|
||||
# Point the app at the pre-baked tiktoken encoding (see open_notebook/config.py)
|
||||
ENV TIKTOKEN_CACHE_DIR=/app/tiktoken-cache
|
||||
|
||||
# Bind Next.js to all interfaces (required for Docker networking and reverse proxies)
|
||||
ENV HOSTNAME=0.0.0.0
|
||||
|
|
|
|||
|
|
@ -33,6 +33,14 @@ COPY open_notebook/__init__.py ./open_notebook/__init__.py
|
|||
# Install dependencies
|
||||
RUN uv sync --frozen --no-dev
|
||||
|
||||
# Pre-download tiktoken encoding so the app works offline (issue #264).
|
||||
# /app/tiktoken-cache is intentionally outside /app/data/ so that volume mounts
|
||||
# of /app/data (for user data persistence) do not hide the pre-baked encoding.
|
||||
# config.py reads TIKTOKEN_CACHE_DIR from the environment to pick up this path.
|
||||
ENV TIKTOKEN_CACHE_DIR=/app/tiktoken-cache
|
||||
RUN mkdir -p /app/tiktoken-cache && \
|
||||
.venv/bin/python -c "import tiktoken; tiktoken.get_encoding('o200k_base')"
|
||||
|
||||
# Stage 5: Runtime
|
||||
FROM python:3.12-slim-bookworm AS runtime
|
||||
|
||||
|
|
@ -57,6 +65,9 @@ WORKDIR /app
|
|||
COPY --from=backend-builder /app/.venv /app/.venv
|
||||
COPY . /app/
|
||||
|
||||
# Copy pre-downloaded tiktoken encoding from builder (outside /app/data/ — volume-mount safe)
|
||||
COPY --from=backend-builder /app/tiktoken-cache /app/tiktoken-cache
|
||||
|
||||
# Copy built frontend from standalone output
|
||||
COPY --from=frontend-builder /app/frontend/.next/standalone /app/frontend/
|
||||
COPY --from=frontend-builder /app/frontend/.next/static /app/frontend/.next/static
|
||||
|
|
@ -64,6 +75,8 @@ COPY --from=frontend-builder /app/frontend/public /app/frontend/public
|
|||
|
||||
# Bind Next.js to all interfaces (required for Docker networking and reverse proxies)
|
||||
ENV HOSTNAME=0.0.0.0
|
||||
# Point the app at the pre-baked tiktoken encoding (see open_notebook/config.py)
|
||||
ENV TIKTOKEN_CACHE_DIR=/app/tiktoken-cache
|
||||
|
||||
# Setup directories and permissions
|
||||
RUN mkdir -p /app/data /mydata
|
||||
|
|
|
|||
|
|
@ -13,5 +13,8 @@ UPLOADS_FOLDER = f"{DATA_FOLDER}/uploads"
|
|||
os.makedirs(UPLOADS_FOLDER, exist_ok=True)
|
||||
|
||||
# TIKTOKEN CACHE FOLDER
|
||||
TIKTOKEN_CACHE_DIR = f"{DATA_FOLDER}/tiktoken-cache"
|
||||
# Reads TIKTOKEN_CACHE_DIR from the environment so Docker can redirect the cache
|
||||
# to a path outside /data/ (which is typically volume-mounted and would hide the
|
||||
# encoding pre-baked into the image at build time).
|
||||
TIKTOKEN_CACHE_DIR = os.environ.get("TIKTOKEN_CACHE_DIR", "").strip() or f"{DATA_FOLDER}/tiktoken-cache"
|
||||
os.makedirs(TIKTOKEN_CACHE_DIR, exist_ok=True)
|
||||
|
|
|
|||
|
|
@ -28,8 +28,15 @@ def token_count(input_string: str) -> int:
|
|||
encoding = tiktoken.get_encoding("o200k_base")
|
||||
tokens = encoding.encode(input_string)
|
||||
return len(tokens)
|
||||
except ImportError:
|
||||
# Fallback: simple word count estimation
|
||||
except Exception:
|
||||
# Fallback: handles ImportError (not installed) AND network errors
|
||||
# (e.g., offline environments that can't download encoding from internet)
|
||||
from loguru import logger
|
||||
|
||||
logger.warning(
|
||||
"tiktoken unavailable (not installed or offline); "
|
||||
"falling back to word-count estimation."
|
||||
)
|
||||
return int(len(input_string.split()) * 1.3)
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -151,6 +151,28 @@ class TestTokenUtilities:
|
|||
assert isinstance(count, int)
|
||||
assert count > 0
|
||||
|
||||
def test_token_count_network_error_fallback(self):
|
||||
"""Test fallback when tiktoken raises a network error (issue #264).
|
||||
|
||||
In offline environments tiktoken.get_encoding() tries to download the
|
||||
encoding file and raises a URLError/OSError, not an ImportError.
|
||||
The except clause must catch Exception (not only ImportError) so that
|
||||
these network failures also fall through to the word-count estimate.
|
||||
"""
|
||||
import urllib.error
|
||||
from unittest.mock import patch
|
||||
|
||||
with patch(
|
||||
"tiktoken.get_encoding",
|
||||
side_effect=urllib.error.URLError("No network (simulated offline)"),
|
||||
):
|
||||
text = "one two three four five"
|
||||
count = token_count(text)
|
||||
|
||||
# Must not raise; must return a positive int via the fallback
|
||||
assert isinstance(count, int)
|
||||
assert count > 0
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# TEST SUITE 3: Version Utilities
|
||||
|
|
|
|||
Loading…
Reference in a new issue