fix: handle empty/whitespace source content without retry loop (#576)

Source.vectorize() wrapped its own ValueError in DatabaseOperationError,
bypassing the stop_on=[ValueError] retry guard in process_source_command.
This caused up to 15 retries when processing files with no extractable
text, blocking sync API requests indefinitely.

- Re-raise ValueError directly in Source.vectorize() instead of wrapping
- Add .strip() check to catch whitespace-only content
- Skip vectorization gracefully in save_source() when content is empty
- Add unit tests for vectorize error handling

Fixes #560
This commit is contained in:
Luis Novo 2026-02-14 18:09:07 -03:00 committed by GitHub
parent 877c303b02
commit 26d5349750
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 47 additions and 3 deletions

View file

@ -429,7 +429,7 @@ class Source(ObjectModel):
logger.info(f"Submitting embed_source job for source {self.id}")
try:
if not self.full_text:
if not self.full_text or not self.full_text.strip():
raise ValueError(f"Source {self.id} has no text to vectorize")
# Submit the embed_source command
@ -447,6 +447,8 @@ class Source(ObjectModel):
return command_id_str
except ValueError:
raise
except Exception as e:
logger.error(
f"Failed to submit embed_source job for source {self.id}: {e}"

View file

@ -101,8 +101,13 @@ async def save_source(state: SourceState) -> dict:
# No need to create them here to avoid duplicate edges
if state["embed"]:
logger.debug("Embedding content for vector search")
await source.vectorize()
if source.full_text and source.full_text.strip():
logger.debug("Embedding content for vector search")
await source.vectorize()
else:
logger.warning(
f"Source {source.id} has no text content to embed, skipping vectorization"
)
return {"source": source}

View file

@ -202,6 +202,43 @@ class TestSourceDomain:
mock_delete.assert_called_once()
@pytest.mark.asyncio
async def test_vectorize_raises_valueerror_when_no_text(self):
"""Test that vectorize() raises ValueError (not DatabaseOperationError) for empty text."""
source = Source(id="source:test_empty", title="Test", full_text=None)
with pytest.raises(ValueError, match="has no text to vectorize"):
await source.vectorize()
@pytest.mark.asyncio
async def test_vectorize_raises_valueerror_when_empty_string(self):
"""Test that vectorize() raises ValueError for empty string."""
source = Source(id="source:test_empty_str", title="Test", full_text="")
with pytest.raises(ValueError, match="has no text to vectorize"):
await source.vectorize()
@pytest.mark.asyncio
async def test_vectorize_raises_valueerror_when_whitespace_only(self):
"""Test that vectorize() raises ValueError for whitespace-only text."""
source = Source(id="source:test_ws", title="Test", full_text=" \n\t ")
with pytest.raises(ValueError, match="has no text to vectorize"):
await source.vectorize()
@pytest.mark.asyncio
async def test_vectorize_submits_command_with_valid_text(self):
"""Test that vectorize() submits embed_source command when text is valid."""
source = Source(id="source:test_valid", title="Test", full_text="Real content")
with patch(
"open_notebook.domain.notebook.submit_command", return_value="command:123"
) as mock_submit:
result = await source.vectorize()
mock_submit.assert_called_once_with(
"open_notebook",
"embed_source",
{"source_id": "source:test_valid"},
)
assert result == "command:123"
# ============================================================================
# TEST SUITE 5: Note Domain
# ============================================================================