From 3be1ecae8a5111e345c33c45ab6aae5754470663 Mon Sep 17 00:00:00 2001 From: LUIS NOVO Date: Mon, 4 Nov 2024 15:06:13 -0300 Subject: [PATCH] improve text splitter --- open_notebook/utils.py | 60 ++++++++++++++++++++++++++++-------------- 1 file changed, 40 insertions(+), 20 deletions(-) diff --git a/open_notebook/utils.py b/open_notebook/utils.py index 86479e2..b25a79c 100644 --- a/open_notebook/utils.py +++ b/open_notebook/utils.py @@ -5,29 +5,10 @@ from urllib.parse import urlparse import requests import tomli -from langchain_text_splitters import CharacterTextSplitter +from langchain_text_splitters import RecursiveCharacterTextSplitter from packaging.version import parse as parse_version -def split_text(txt: str, chunk=1000, overlap=0, separator=" "): - """ - Split the input text into chunks. - - Args: - txt (str): The input text to be split. - chunk (int): The size of each chunk. Default is 1000. - overlap (int): The number of characters to overlap between chunks. Default is 0. - separator (str): The separator to use when splitting the text. Default is " ". - - Returns: - list: A list of text chunks. - """ - text_splitter = CharacterTextSplitter( - chunk_size=chunk, chunk_overlap=overlap, separator=separator - ) - return text_splitter.split_text(txt) - - def token_count(input_string) -> int: """ Count the number of tokens in the input string using the 'o200k_base' encoding. @@ -60,15 +41,54 @@ def token_cost(token_count, cost_per_million=0.150) -> float: return cost_per_million * (token_count / 1_000_000) +def split_text(txt: str, chunk_size=500): + """ + Split the input text into chunks. + + Args: + txt (str): The input text to be split. + chunk_size (int): Size limit passed to the splitter, in tokens as counted by token_count. Default is 500. + The overlap between chunks is not configurable; it is fixed at int(chunk_size * 0.15). + The separators are also fixed, tried in order from paragraph breaks down to single characters. + + Returns: + list: A list of text chunks. 
+ """ + overlap = int(chunk_size * 0.15) + text_splitter = RecursiveCharacterTextSplitter( + chunk_size=chunk_size, + chunk_overlap=overlap, + length_function=token_count, + separators=[ + "\n\n", + "\n", + ".", + ",", + " ", + "\u200b", # Zero-width space + "\uff0c", # Fullwidth comma + "\u3001", # Ideographic comma + "\uff0e", # Fullwidth full stop + "\u3002", # Ideographic full stop + "", + ], + ) + return text_splitter.split_text(txt) + + def remove_non_ascii(text) -> str: return re.sub(r"[^\x00-\x7F]+", "", text) def remove_non_printable(text) -> str: + # Replace any special Unicode whitespace characters with a regular space + text = re.sub(r"[\u2000-\u200B\u202F\u205F\u3000]", " ", text) + # Remove control characters, except newlines and tabs text = "".join( char for char in text if unicodedata.category(char)[0] != "C" or char in "\n\t" ) + # Replace non-breaking spaces with regular spaces text = text.replace("\xa0", " ").strip() # Keep letters (including accented ones), numbers, spaces, newlines, tabs, and basic punctuation return re.sub(r"[^\w\s.,!?\-\n\t]", "", text, flags=re.UNICODE)