""" Unit tests for the open_notebook.utils.chunking module. Tests content type detection and text chunking functionality. """ import pytest from open_notebook.utils.chunking import ( CHUNK_SIZE, ContentType, chunk_text, detect_content_type, detect_content_type_from_extension, detect_content_type_from_heuristics, ) # ============================================================================ # TEST SUITE 1: Content Type Detection from Extension # ============================================================================ class TestDetectContentTypeFromExtension: """Test suite for extension-based content type detection.""" def test_html_extensions(self): """Test HTML file extensions.""" assert detect_content_type_from_extension("file.html") == ContentType.HTML assert detect_content_type_from_extension("file.htm") == ContentType.HTML assert detect_content_type_from_extension("file.xhtml") == ContentType.HTML assert detect_content_type_from_extension("/path/to/file.HTML") == ContentType.HTML def test_markdown_extensions(self): """Test Markdown file extensions.""" assert detect_content_type_from_extension("file.md") == ContentType.MARKDOWN assert detect_content_type_from_extension("file.markdown") == ContentType.MARKDOWN assert detect_content_type_from_extension("file.mdown") == ContentType.MARKDOWN assert detect_content_type_from_extension("/path/to/README.MD") == ContentType.MARKDOWN def test_plain_text_extensions(self): """Test plain text file extensions.""" assert detect_content_type_from_extension("file.txt") == ContentType.PLAIN assert detect_content_type_from_extension("file.text") == ContentType.PLAIN def test_code_extensions_as_plain(self): """Test code file extensions are treated as plain text.""" assert detect_content_type_from_extension("file.py") == ContentType.PLAIN assert detect_content_type_from_extension("file.js") == ContentType.PLAIN assert detect_content_type_from_extension("file.json") == ContentType.PLAIN assert detect_content_type_from_extension("file.yaml") == ContentType.PLAIN def test_unknown_extensions(self): """Test unknown extensions return None.""" assert detect_content_type_from_extension("file.xyz") is None assert detect_content_type_from_extension("file.docx") is None assert detect_content_type_from_extension("file.pdf") is None def test_no_extension(self): """Test files without extension.""" assert detect_content_type_from_extension("Makefile") is None assert detect_content_type_from_extension("README") is None def test_none_input(self): """Test None input.""" assert detect_content_type_from_extension(None) is None def test_empty_string(self): """Test empty string input.""" assert detect_content_type_from_extension("") is None # ============================================================================ # TEST SUITE 2: Content Type Detection from Heuristics # ============================================================================ class TestDetectContentTypeFromHeuristics: """Test suite for heuristics-based content type detection.""" def test_html_detection_doctype(self): """Test HTML detection with DOCTYPE.""" html_text = "Content" content_type, confidence = detect_content_type_from_heuristics(html_text) assert content_type == ContentType.HTML assert confidence >= 0.8 def test_html_detection_tags(self): """Test HTML detection with structural tags.""" html_text = "Test

Content

" content_type, confidence = detect_content_type_from_heuristics(html_text) assert content_type == ContentType.HTML assert confidence >= 0.5 def test_markdown_detection_headers(self): """Test Markdown detection with headers.""" md_text = """# Main Title ## Section 1 Some content here. ## Section 2 More content. ### Subsection Details here. """ content_type, confidence = detect_content_type_from_heuristics(md_text) assert content_type == ContentType.MARKDOWN assert confidence >= 0.3 # 4 headers give ~0.35 confidence def test_markdown_detection_links(self): """Test Markdown detection with links and headers for stronger signal.""" md_text = """# Documentation Check out [this link](https://example.com) and [another one](https://test.com). ## References Here's some more text with [links](url) and `inline code`.""" content_type, confidence = detect_content_type_from_heuristics(md_text) assert content_type == ContentType.MARKDOWN assert confidence >= 0.4 def test_markdown_detection_code_blocks(self): """Test Markdown detection with code blocks.""" md_text = """# Code Example ```python def hello(): print("Hello, World!") ``` Some explanation text. """ content_type, confidence = detect_content_type_from_heuristics(md_text) assert content_type == ContentType.MARKDOWN assert confidence >= 0.5 def test_plain_text_detection(self): """Test plain text detection.""" plain_text = """This is just regular plain text. It has multiple lines but no special formatting. No headers, no links, no HTML tags. Just regular sentences and paragraphs.""" content_type, confidence = detect_content_type_from_heuristics(plain_text) assert content_type == ContentType.PLAIN def test_short_text(self): """Test short text defaults to plain.""" content_type, confidence = detect_content_type_from_heuristics("Hi") assert content_type == ContentType.PLAIN def test_empty_text(self): """Test empty text defaults to plain.""" content_type, confidence = detect_content_type_from_heuristics("") assert content_type == ContentType.PLAIN # ============================================================================ # TEST SUITE 3: Combined Content Type Detection # ============================================================================ class TestDetectContentType: """Test suite for combined content type detection.""" def test_extension_takes_priority(self): """Test that file extension takes priority over heuristics.""" # Text looks like markdown but file is .txt md_text = "# Header\n\nSome [link](url) content" content_type = detect_content_type(md_text, "file.txt") # Should use extension (plain) unless heuristics are very high confidence # In this case, markdown confidence might override assert content_type in (ContentType.PLAIN, ContentType.MARKDOWN) def test_no_extension_uses_heuristics(self): """Test that heuristics are used when no extension is available.""" html_text = "Test" content_type = detect_content_type(html_text, None) assert content_type == ContentType.HTML def test_extension_html(self): """Test HTML extension detection.""" content_type = detect_content_type("some text", "file.html") assert content_type == ContentType.HTML def test_extension_markdown(self): """Test Markdown extension detection.""" content_type = detect_content_type("some text", "file.md") assert content_type == ContentType.MARKDOWN def test_high_confidence_override(self): """Test that very high confidence heuristics can override plain extension.""" # Strong HTML indicators in a .txt file html_text = "Test

Content

" content_type = detect_content_type(html_text, "file.txt") # High confidence HTML should override .txt extension assert content_type == ContentType.HTML # ============================================================================ # TEST SUITE 4: Text Chunking # ============================================================================ class TestChunkText: """Test suite for text chunking functionality.""" def test_empty_text(self): """Test chunking empty text.""" assert chunk_text("") == [] assert chunk_text(" ") == [] def test_short_text_no_chunking(self): """Test that short text is not chunked.""" text = "This is a short text." chunks = chunk_text(text) assert len(chunks) == 1 assert chunks[0] == text def test_text_at_chunk_limit(self): """Test text at exactly chunk size limit.""" text = "x" * CHUNK_SIZE chunks = chunk_text(text) assert len(chunks) == 1 def test_long_text_is_chunked(self): """Test that long text is chunked.""" # Create text longer than chunk size text = "This is a sentence. " * 200 # ~4000 chars chunks = chunk_text(text) assert len(chunks) > 1 # Each chunk should be <= CHUNK_SIZE for chunk in chunks: assert len(chunk) <= CHUNK_SIZE + 100 # Allow some flexibility for overlap def test_explicit_content_type_html(self): """Test chunking with explicit HTML content type.""" html_text = """

Main Title

First paragraph with lots of content.

Section

Second paragraph.

""" chunks = chunk_text(html_text, content_type=ContentType.HTML) assert len(chunks) >= 1 def test_explicit_content_type_markdown(self): """Test chunking with explicit Markdown content type.""" md_text = """# Main Title Introduction paragraph. ## Section 1 Content for section 1. ## Section 2 Content for section 2. """ chunks = chunk_text(md_text, content_type=ContentType.MARKDOWN) assert len(chunks) >= 1 def test_explicit_content_type_plain(self): """Test chunking with explicit plain content type.""" plain_text = "Word " * 500 # ~2500 chars chunks = chunk_text(plain_text, content_type=ContentType.PLAIN) assert len(chunks) >= 1 def test_file_path_detection(self): """Test chunking with file path for content type detection.""" text = "Some content here" chunks = chunk_text(text, file_path="document.md") assert len(chunks) == 1 def test_secondary_chunking_for_large_sections(self): """Test that large sections from HTML/MD splitters are further chunked.""" # Create text that would produce a single large section large_section = "x" * 3000 # Larger than CHUNK_SIZE md_text = f"# Title\n\n{large_section}" chunks = chunk_text(md_text, content_type=ContentType.MARKDOWN) # Should have multiple chunks due to secondary chunking assert len(chunks) >= 1 for chunk in chunks: # Allow some flexibility but chunks should be reasonable size assert len(chunk) <= CHUNK_SIZE + 300 if __name__ == "__main__": pytest.main([__file__, "-v"])