From 01db97924eb471106a5df6eca88535e07bcaa877 Mon Sep 17 00:00:00 2001 From: LUIS NOVO Date: Thu, 14 Nov 2024 15:19:21 -0300 Subject: [PATCH] improve cleanup function --- open_notebook/utils.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/open_notebook/utils.py b/open_notebook/utils.py index b25a79c..e87690a 100644 --- a/open_notebook/utils.py +++ b/open_notebook/utils.py @@ -84,12 +84,17 @@ def remove_non_printable(text) -> str: # Replace any special Unicode whitespace characters with a regular space text = re.sub(r"[\u2000-\u200B\u202F\u205F\u3000]", " ", text) + # Replace unusual line terminators with a single newline + text = re.sub(r"[\u2028\u2029\r]", "\n", text) + # Remove control characters, except newlines and tabs text = "".join( char for char in text if unicodedata.category(char)[0] != "C" or char in "\n\t" ) + # Replace non-breaking spaces with regular spaces text = text.replace("\xa0", " ").strip() + # Keep letters (including accented ones), numbers, spaces, newlines, tabs, and basic punctuation return re.sub(r"[^\w\s.,!?\-\n\t]", "", text, flags=re.UNICODE)