improve text splitter
This commit is contained in:
parent
418c67f69f
commit
3be1ecae8a
1 changed file with 40 additions and 20 deletions
|
|
@ -5,29 +5,10 @@ from urllib.parse import urlparse
|
|||
|
||||
import requests
|
||||
import tomli
|
||||
from langchain_text_splitters import CharacterTextSplitter
|
||||
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
||||
from packaging.version import parse as parse_version
|
||||
|
||||
|
||||
def split_text(txt: str, chunk=1000, overlap=0, separator=" "):
    """
    Break the input text into chunks using a single separator.

    Args:
        txt (str): The text to break apart.
        chunk (int): Size of each chunk, forwarded as ``chunk_size``. Default is 1000.
        overlap (int): Overlap between consecutive chunks, forwarded as
            ``chunk_overlap``. Default is 0.
        separator (str): The separator the splitter cuts on. Default is " ".

    Returns:
        list: The chunks produced by ``CharacterTextSplitter.split_text``.
    """
    # Collect the splitter configuration first so the mapping from this
    # function's short parameter names to the library's keywords is explicit.
    splitter_options = {
        "chunk_size": chunk,
        "chunk_overlap": overlap,
        "separator": separator,
    }
    return CharacterTextSplitter(**splitter_options).split_text(txt)
|
||||
|
||||
|
||||
def token_count(input_string) -> int:
|
||||
"""
|
||||
Count the number of tokens in the input string using the 'o200k_base' encoding.
|
||||
|
|
@ -60,15 +41,54 @@ def token_cost(token_count, cost_per_million=0.150) -> float:
|
|||
return cost_per_million * (token_count / 1_000_000)
|
||||
|
||||
|
||||
def split_text(txt: str, chunk_size=500, overlap_ratio=0.15):
    """
    Split the input text into overlapping chunks whose size is counted in tokens.

    The text is split recursively: the splitter tries each separator in order
    (paragraph breaks first, then line breaks, sentence and clause punctuation,
    spaces, and finally CJK punctuation) and falls back to the empty-string
    separator as a last resort.

    Args:
        txt (str): The input text to be split.
        chunk_size (int): The chunk size passed to the splitter. Lengths are
            measured with ``token_count`` (tokens, not characters).
            Default is 500.
        overlap_ratio (float): Fraction of ``chunk_size`` used as the overlap
            between consecutive chunks. Default is 0.15.

    Returns:
        list: A list of text chunks.
    """
    # int() truncates, so e.g. chunk_size=500 yields an overlap of 75 tokens.
    overlap = int(chunk_size * overlap_ratio)
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap,
        # Measure chunk length in tokens rather than with the default len().
        length_function=token_count,
        # Order matters: earlier separators are preferred over later ones.
        separators=[
            "\n\n",
            "\n",
            ".",
            ",",
            " ",
            "\u200b",  # Zero-width space
            "\uff0c",  # Fullwidth comma
            "\u3001",  # Ideographic comma
            "\uff0e",  # Fullwidth full stop
            "\u3002",  # Ideographic full stop
            "",
        ],
    )
    return text_splitter.split_text(txt)
|
||||
|
||||
|
||||
def remove_non_ascii(text) -> str:
    """Return ``text`` with every run of non-ASCII characters deleted."""
    # Anything outside the 7-bit range U+0000..U+007F is dropped, not replaced.
    non_ascii_run = re.compile(r"[^\x00-\x7F]+")
    return non_ascii_run.sub("", text)
|
||||
|
||||
|
||||
def remove_non_printable(text) -> str:
    """
    Normalize whitespace and strip characters outside a small allowed set.

    Steps, in order:
      1. Replace special Unicode space characters (U+2000-U+200B, U+202F,
         U+205F, U+3000) with a regular space.
      2. Drop every character whose Unicode category starts with "C"
         (control, format, ...), except newline and tab.
      3. Turn non-breaking spaces into regular spaces and strip the ends.
      4. Remove anything that is not a word character, whitespace, or one
         of ``. , ! ? -``.

    Args:
        text: The string to clean.

    Returns:
        str: The cleaned text.
    """
    spaced = re.sub(r"[\u2000-\u200B\u202F\u205F\u3000]", " ", text)

    # Filter out "C*" category characters but keep newlines and tabs.
    kept = []
    for ch in spaced:
        if ch in "\n\t" or not unicodedata.category(ch).startswith("C"):
            kept.append(ch)

    # Stripping happens before the final filter, so whitespace that ends up
    # at an edge after punctuation removal is left in place.
    trimmed = "".join(kept).replace("\xa0", " ").strip()

    return re.sub(r"[^\w\s.,!?\-\n\t]", "", trimmed, flags=re.UNICODE)
|
||||
|
|
|
|||
Loading…
Reference in a new issue