""" Unit tests for the open_notebook.utils.chunking module. Tests content type detection and text chunking functionality. """ import pytest from open_notebook.utils.chunking import ( CHUNK_SIZE, ContentType, chunk_text, detect_content_type, detect_content_type_from_extension, detect_content_type_from_heuristics, ) # ============================================================================ # TEST SUITE 1: Content Type Detection from Extension # ============================================================================ class TestDetectContentTypeFromExtension: """Test suite for extension-based content type detection.""" def test_html_extensions(self): """Test HTML file extensions.""" assert detect_content_type_from_extension("file.html") == ContentType.HTML assert detect_content_type_from_extension("file.htm") == ContentType.HTML assert detect_content_type_from_extension("file.xhtml") == ContentType.HTML assert detect_content_type_from_extension("/path/to/file.HTML") == ContentType.HTML def test_markdown_extensions(self): """Test Markdown file extensions.""" assert detect_content_type_from_extension("file.md") == ContentType.MARKDOWN assert detect_content_type_from_extension("file.markdown") == ContentType.MARKDOWN assert detect_content_type_from_extension("file.mdown") == ContentType.MARKDOWN assert detect_content_type_from_extension("/path/to/README.MD") == ContentType.MARKDOWN def test_plain_text_extensions(self): """Test plain text file extensions.""" assert detect_content_type_from_extension("file.txt") == ContentType.PLAIN assert detect_content_type_from_extension("file.text") == ContentType.PLAIN def test_code_extensions_as_plain(self): """Test code file extensions are treated as plain text.""" assert detect_content_type_from_extension("file.py") == ContentType.PLAIN assert detect_content_type_from_extension("file.js") == ContentType.PLAIN assert detect_content_type_from_extension("file.json") == ContentType.PLAIN assert detect_content_type_from_extension("file.yaml") == ContentType.PLAIN def test_unknown_extensions(self): """Test unknown extensions return None.""" assert detect_content_type_from_extension("file.xyz") is None assert detect_content_type_from_extension("file.docx") is None assert detect_content_type_from_extension("file.pdf") is None def test_no_extension(self): """Test files without extension.""" assert detect_content_type_from_extension("Makefile") is None assert detect_content_type_from_extension("README") is None def test_none_input(self): """Test None input.""" assert detect_content_type_from_extension(None) is None def test_empty_string(self): """Test empty string input.""" assert detect_content_type_from_extension("") is None # ============================================================================ # TEST SUITE 2: Content Type Detection from Heuristics # ============================================================================ class TestDetectContentTypeFromHeuristics: """Test suite for heuristics-based content type detection.""" def test_html_detection_doctype(self): """Test HTML detection with DOCTYPE.""" html_text = "
Content" content_type, confidence = detect_content_type_from_heuristics(html_text) assert content_type == ContentType.HTML assert confidence >= 0.8 def test_html_detection_tags(self): """Test HTML detection with structural tags.""" html_text = "Content
Content
First paragraph with lots of content.
Second paragraph.
""" chunks = chunk_text(html_text, content_type=ContentType.HTML) assert len(chunks) >= 1 def test_explicit_content_type_markdown(self): """Test chunking with explicit Markdown content type.""" md_text = """# Main Title Introduction paragraph. ## Section 1 Content for section 1. ## Section 2 Content for section 2. """ chunks = chunk_text(md_text, content_type=ContentType.MARKDOWN) assert len(chunks) >= 1 def test_explicit_content_type_plain(self): """Test chunking with explicit plain content type.""" plain_text = "Word " * 500 # ~2500 chars chunks = chunk_text(plain_text, content_type=ContentType.PLAIN) assert len(chunks) >= 1 def test_file_path_detection(self): """Test chunking with file path for content type detection.""" text = "Some content here" chunks = chunk_text(text, file_path="document.md") assert len(chunks) == 1 def test_secondary_chunking_for_large_sections(self): """Test that large sections from HTML/MD splitters are further chunked.""" # Create text that would produce a single large section large_section = "x" * 3000 # Larger than CHUNK_SIZE md_text = f"# Title\n\n{large_section}" chunks = chunk_text(md_text, content_type=ContentType.MARKDOWN) # Should have multiple chunks due to secondary chunking assert len(chunks) >= 1 for chunk in chunks: # Allow some flexibility but chunks should be reasonable size assert len(chunk) <= CHUNK_SIZE + 300 if __name__ == "__main__": pytest.main([__file__, "-v"])