From 26d53497503b46c37975b50e5f3505770049f5f0 Mon Sep 17 00:00:00 2001 From: Luis Novo Date: Sat, 14 Feb 2026 18:09:07 -0300 Subject: [PATCH] fix: handle empty/whitespace source content without retry loop (#576) Source.vectorize() wrapped its own ValueError in DatabaseOperationError, bypassing the stop_on=[ValueError] retry guard in process_source_command. This caused up to 15 retries when processing files with no extractable text, blocking sync API requests indefinitely. - Re-raise ValueError directly in Source.vectorize() instead of wrapping - Add .strip() check to catch whitespace-only content - Skip vectorization gracefully in save_source() when content is empty - Add unit tests for vectorize error handling Fixes #560 --- open_notebook/domain/notebook.py | 4 +++- open_notebook/graphs/source.py | 9 ++++++-- tests/test_domain.py | 37 ++++++++++++++++++++++++++++++++ 3 files changed, 47 insertions(+), 3 deletions(-) diff --git a/open_notebook/domain/notebook.py b/open_notebook/domain/notebook.py index 05dc8f2..656255f 100644 --- a/open_notebook/domain/notebook.py +++ b/open_notebook/domain/notebook.py @@ -429,7 +429,7 @@ class Source(ObjectModel): logger.info(f"Submitting embed_source job for source {self.id}") try: - if not self.full_text: + if not self.full_text or not self.full_text.strip(): raise ValueError(f"Source {self.id} has no text to vectorize") # Submit the embed_source command @@ -447,6 +447,8 @@ class Source(ObjectModel): return command_id_str + except ValueError: + raise except Exception as e: logger.error( f"Failed to submit embed_source job for source {self.id}: {e}" diff --git a/open_notebook/graphs/source.py b/open_notebook/graphs/source.py index 02d1e75..dc55390 100644 --- a/open_notebook/graphs/source.py +++ b/open_notebook/graphs/source.py @@ -101,8 +101,13 @@ async def save_source(state: SourceState) -> dict: # No need to create them here to avoid duplicate edges if state["embed"]: - logger.debug("Embedding content for vector search") - await source.vectorize() + if source.full_text and source.full_text.strip(): + logger.debug("Embedding content for vector search") + await source.vectorize() + else: + logger.warning( + f"Source {source.id} has no text content to embed, skipping vectorization" + ) return {"source": source} diff --git a/tests/test_domain.py b/tests/test_domain.py index 7fe5167..917c163 100644 --- a/tests/test_domain.py +++ b/tests/test_domain.py @@ -202,6 +202,43 @@ class TestSourceDomain: mock_delete.assert_called_once() + @pytest.mark.asyncio + async def test_vectorize_raises_valueerror_when_no_text(self): + """Test that vectorize() raises ValueError (not DatabaseOperationError) for empty text.""" + source = Source(id="source:test_empty", title="Test", full_text=None) + with pytest.raises(ValueError, match="has no text to vectorize"): + await source.vectorize() + + @pytest.mark.asyncio + async def test_vectorize_raises_valueerror_when_empty_string(self): + """Test that vectorize() raises ValueError for empty string.""" + source = Source(id="source:test_empty_str", title="Test", full_text="") + with pytest.raises(ValueError, match="has no text to vectorize"): + await source.vectorize() + + @pytest.mark.asyncio + async def test_vectorize_raises_valueerror_when_whitespace_only(self): + """Test that vectorize() raises ValueError for whitespace-only text.""" + source = Source(id="source:test_ws", title="Test", full_text=" \n\t ") + with pytest.raises(ValueError, match="has no text to vectorize"): + await source.vectorize() + + @pytest.mark.asyncio + async def test_vectorize_submits_command_with_valid_text(self): + """Test that vectorize() submits embed_source command when text is valid.""" + source = Source(id="source:test_valid", title="Test", full_text="Real content") + with patch( + "open_notebook.domain.notebook.submit_command", return_value="command:123" + ) as mock_submit: + result = await source.vectorize() + mock_submit.assert_called_once_with( + "open_notebook", + "embed_source", + {"source_id": "source:test_valid"}, + ) + assert result == "command:123" + + # ============================================================================ # TEST SUITE 5: Note Domain # ============================================================================