fix: handle empty/whitespace source content without retry loop (#576)
Source.vectorize() wrapped its own ValueError in DatabaseOperationError, bypassing the stop_on=[ValueError] retry guard in process_source_command. This caused up to 15 retries when processing files with no extractable text, blocking sync API requests indefinitely. - Re-raise ValueError directly in Source.vectorize() instead of wrapping - Add .strip() check to catch whitespace-only content - Skip vectorization gracefully in save_source() when content is empty - Add unit tests for vectorize error handling Fixes #560
This commit is contained in:
parent
877c303b02
commit
26d5349750
3 changed files with 47 additions and 3 deletions
|
|
@ -429,7 +429,7 @@ class Source(ObjectModel):
|
|||
logger.info(f"Submitting embed_source job for source {self.id}")
|
||||
|
||||
try:
|
||||
if not self.full_text:
|
||||
if not self.full_text or not self.full_text.strip():
|
||||
raise ValueError(f"Source {self.id} has no text to vectorize")
|
||||
|
||||
# Submit the embed_source command
|
||||
|
|
@ -447,6 +447,8 @@ class Source(ObjectModel):
|
|||
|
||||
return command_id_str
|
||||
|
||||
except ValueError:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"Failed to submit embed_source job for source {self.id}: {e}"
|
||||
|
|
|
|||
|
|
@ -101,8 +101,13 @@ async def save_source(state: SourceState) -> dict:
|
|||
# No need to create them here to avoid duplicate edges
|
||||
|
||||
if state["embed"]:
|
||||
logger.debug("Embedding content for vector search")
|
||||
await source.vectorize()
|
||||
if source.full_text and source.full_text.strip():
|
||||
logger.debug("Embedding content for vector search")
|
||||
await source.vectorize()
|
||||
else:
|
||||
logger.warning(
|
||||
f"Source {source.id} has no text content to embed, skipping vectorization"
|
||||
)
|
||||
|
||||
return {"source": source}
|
||||
|
||||
|
|
|
|||
|
|
@ -202,6 +202,43 @@ class TestSourceDomain:
|
|||
mock_delete.assert_called_once()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_vectorize_raises_valueerror_when_no_text(self):
|
||||
"""Test that vectorize() raises ValueError (not DatabaseOperationError) for empty text."""
|
||||
source = Source(id="source:test_empty", title="Test", full_text=None)
|
||||
with pytest.raises(ValueError, match="has no text to vectorize"):
|
||||
await source.vectorize()
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_vectorize_raises_valueerror_when_empty_string(self):
|
||||
"""Test that vectorize() raises ValueError for empty string."""
|
||||
source = Source(id="source:test_empty_str", title="Test", full_text="")
|
||||
with pytest.raises(ValueError, match="has no text to vectorize"):
|
||||
await source.vectorize()
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_vectorize_raises_valueerror_when_whitespace_only(self):
|
||||
"""Test that vectorize() raises ValueError for whitespace-only text."""
|
||||
source = Source(id="source:test_ws", title="Test", full_text=" \n\t ")
|
||||
with pytest.raises(ValueError, match="has no text to vectorize"):
|
||||
await source.vectorize()
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_vectorize_submits_command_with_valid_text(self):
|
||||
"""Test that vectorize() submits embed_source command when text is valid."""
|
||||
source = Source(id="source:test_valid", title="Test", full_text="Real content")
|
||||
with patch(
|
||||
"open_notebook.domain.notebook.submit_command", return_value="command:123"
|
||||
) as mock_submit:
|
||||
result = await source.vectorize()
|
||||
mock_submit.assert_called_once_with(
|
||||
"open_notebook",
|
||||
"embed_source",
|
||||
{"source_id": "source:test_valid"},
|
||||
)
|
||||
assert result == "command:123"
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# TEST SUITE 5: Note Domain
|
||||
# ============================================================================
|
||||
|
|
|
|||
Loading…
Reference in a new issue