From 10ec265a444de13dad0d796c8bb5ca93380d9e9d Mon Sep 17 00:00:00 2001 From: LUIS NOVO Date: Thu, 24 Oct 2024 13:24:39 -0300 Subject: [PATCH] fix pdf encoding --- open_notebook/graphs/content_process.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/open_notebook/graphs/content_process.py b/open_notebook/graphs/content_process.py index 9534db0..2a8b9c3 100644 --- a/open_notebook/graphs/content_process.py +++ b/open_notebook/graphs/content_process.py @@ -1,4 +1,5 @@ import re +import unicodedata import fitz # type: ignore import magic @@ -68,7 +69,8 @@ def _extract_text_from_pdf(pdf_path): for page in doc: text += page.get_text() doc.close() - return text + normalized_content = unicodedata.normalize("NFKD", text) + return normalized_content def extract_pdf(state: SourceState):