diff --git a/open_notebook/graphs/content_process.py b/open_notebook/graphs/content_process.py index 9534db0..2a8b9c3 100644 --- a/open_notebook/graphs/content_process.py +++ b/open_notebook/graphs/content_process.py @@ -1,4 +1,5 @@ import re +import unicodedata import fitz # type: ignore import magic @@ -68,7 +69,8 @@ def _extract_text_from_pdf(pdf_path): for page in doc: text += page.get_text() doc.close() - return text + normalized_content = unicodedata.normalize("NFKD", text) + return normalized_content def extract_pdf(state: SourceState):