From 3be1ecae8a5111e345c33c45ab6aae5754470663 Mon Sep 17 00:00:00 2001
From: LUIS NOVO <lfnovo@gmail.com>
Date: Mon, 4 Nov 2024 15:06:13 -0300
Subject: [PATCH] improve text splitter

---
 open_notebook/utils.py | 60 ++++++++++++++++++++++++++++--------------
 1 file changed, 40 insertions(+), 20 deletions(-)

diff --git a/open_notebook/utils.py b/open_notebook/utils.py
index 86479e2..b25a79c 100644
--- a/open_notebook/utils.py
+++ b/open_notebook/utils.py
@@ -5,29 +5,10 @@ from urllib.parse import urlparse
 
 import requests
 import tomli
-from langchain_text_splitters import CharacterTextSplitter
+from langchain_text_splitters import RecursiveCharacterTextSplitter
 from packaging.version import parse as parse_version
 
 
-def split_text(txt: str, chunk=1000, overlap=0, separator=" "):
-    """
-    Split the input text into chunks.
-
-    Args:
-        txt (str): The input text to be split.
-        chunk (int): The size of each chunk. Default is 1000.
-        overlap (int): The number of characters to overlap between chunks. Default is 0.
-        separator (str): The separator to use when splitting the text. Default is " ".
-
-    Returns:
-        list: A list of text chunks.
-    """
-    text_splitter = CharacterTextSplitter(
-        chunk_size=chunk, chunk_overlap=overlap, separator=separator
-    )
-    return text_splitter.split_text(txt)
-
-
 def token_count(input_string) -> int:
     """
     Count the number of tokens in the input string using the 'o200k_base' encoding.
@@ -60,15 +41,54 @@ def token_cost(token_count, cost_per_million=0.150) -> float:
     return cost_per_million * (token_count / 1_000_000)
 
 
+def split_text(txt: str, chunk_size=500):
+    """
+    Split the input text into chunks.
+
+    Args:
+        txt (str): The input text to be split.
+        chunk_size (int): Maximum chunk size, measured in tokens by
+            token_count. Default is 500. The chunk overlap passed to the
+            splitter is 15% of chunk_size, truncated to an integer.
+
+    Returns:
+        list: A list of text chunks.
+    """
+    overlap = int(chunk_size * 0.15)
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=chunk_size,
+        chunk_overlap=overlap,
+        length_function=token_count,
+        separators=[
+            "\n\n",
+            "\n",
+            ".",
+            ",",
+            " ",
+            "\u200b",  # Zero-width space
+            "\uff0c",  # Fullwidth comma
+            "\u3001",  # Ideographic comma
+            "\uff0e",  # Fullwidth full stop
+            "\u3002",  # Ideographic full stop
+            "",
+        ],
+    )
+    return text_splitter.split_text(txt)
+
+
 def remove_non_ascii(text) -> str:
     return re.sub(r"[^\x00-\x7F]+", "", text)
 
 
 def remove_non_printable(text) -> str:
+    # Replace any special Unicode whitespace characters with a regular space
+    text = re.sub(r"[\u2000-\u200B\u202F\u205F\u3000]", " ", text)
+
     # Remove control characters, except newlines and tabs
     text = "".join(
         char for char in text if unicodedata.category(char)[0] != "C" or char in "\n\t"
     )
+    # Replace non-breaking spaces with regular spaces
     text = text.replace("\xa0", " ").strip()
     # Keep letters (including accented ones), numbers, spaces, newlines, tabs, and basic punctuation
     return re.sub(r"[^\w\s.,!?\-\n\t]", "", text, flags=re.UNICODE)