process table of contents for all sources

This commit is contained in:
LUIS NOVO 2024-10-23 15:09:40 -03:00
parent 795fd3bb9d
commit 9b9303d52f
5 changed files with 121 additions and 14 deletions

View file

@ -11,7 +11,8 @@ from open_notebook.exceptions import (
InvalidInputError,
NotFoundError,
)
from open_notebook.graphs.summary import graph as summarizer
from open_notebook.graphs.multipattern import graph as pattern_graph
from open_notebook.graphs.recursive_toc import graph as toc_graph
from open_notebook.repository import (
repo_create,
repo_delete,
@ -239,8 +240,7 @@ class Source(ObjectModel):
def vectorize(self) -> None:
try:
full_text = self.full_text
if not full_text:
if not self.full_text:
return
chunks = split_text(
self.full_text,
@ -306,15 +306,20 @@ class Source(ObjectModel):
logger.error(f"Error adding insight to source {self.id}: {str(e)}")
raise DatabaseOperationError(e)
def summarize(self) -> "Source":
def generate_toc_and_title(self) -> "Source":
try:
config = RunnableConfig(configurable=dict(thread_id=self.id))
result = summarizer.invoke({"content": self.full_text}, config=config)[
"output"
result = toc_graph.invoke({"content": self.full_text}, config=config)
logger.warning(result["toc"])
self.add_insight("Table of Contents", surreal_clean(result["toc"]))
transformations = [
"Based on the Table of Contents below, please provide a Title for this content, with max 15 words"
]
self.add_insight("summary", surreal_clean(result.summary))
self.title = surreal_clean(result.title)
self.topics = result.topics
output = pattern_graph.invoke(
dict(content_stack=[result["toc"]], transformations=transformations)
)
logger.warning(output["output"])
self.title = surreal_clean(output["output"])
self.save()
return self
except Exception as e:

View file

@ -0,0 +1,78 @@
import os
from typing import List, Literal
from langchain_core.runnables import (
RunnableConfig,
)
from langgraph.graph import END, START, StateGraph
from typing_extensions import TypedDict
from open_notebook.graphs.utils import run_pattern
from open_notebook.utils import split_text
class TocState(TypedDict):
    """State carried through the recursive table-of-contents graph."""

    # Chunks of the input text that have not been processed yet.
    chunks: List[str]
    # The text to work on: first the full input, later the chunk being processed.
    content: str
    # Table of contents returned by the most recent model call.
    toc: str
def build_chunks(state: TocState) -> dict:
    """
    Split the input text into chunks.

    The chunk size and overlap come from the SUMMARY_CHUNK_SIZE and
    SUMMARY_CHUNK_OVERLAP environment variables, falling back to 200000 and
    1000 when they are not set.

    Returns a state update holding the resulting "chunks".
    """
    text = state["content"]
    chunk_size = int(os.environ.get("SUMMARY_CHUNK_SIZE", 200000))
    chunk_overlap = int(os.environ.get("SUMMARY_CHUNK_OVERLAP", 1000))
    pieces = split_text(text, chunk=chunk_size, overlap=chunk_overlap)
    return {"chunks": pieces}
def setup_next_chunk(state: TocState) -> dict:
    """
    Move the next pending chunk into the processing area.

    The first entry of ``state["chunks"]`` becomes the new "content" and the
    remaining entries become the new "chunks". The incoming state is left
    untouched; the change is reported only through the returned update.

    Raises IndexError when there are no chunks left.
    """
    pending = state["chunks"]
    # Slice rather than pop(0) so the list held by the incoming state is not
    # mutated in place.
    return {"chunks": pending[1:], "content": pending[0]}
def chunk_condition(state: TocState) -> Literal["get_chunk", END]:  # type: ignore
    """
    Check whether there are more chunks to process.

    Returns "get_chunk" while ``state["chunks"]`` still has entries, and END
    once it is empty.
    """
    # An empty list is falsy, so no explicit length comparison is needed.
    if state["chunks"]:
        return "get_chunk"
    return END
def call_model(state: TocState, config: RunnableConfig) -> dict:
    """
    Run the "recursive_toc" pattern on the current state.

    The model name is taken from ``config["configurable"]["model_name"]`` when
    present, otherwise from the SUMMARIZATION_MODEL environment variable.

    Returns a state update whose "toc" is the ``content`` attribute of the
    pattern result.
    """
    configurable = config.get("configurable", {})
    fallback_model = os.environ.get("SUMMARIZATION_MODEL")
    model_name = configurable.get("model_name", fallback_model)
    response = run_pattern(
        pattern_name="recursive_toc",
        model_name=model_name,
        state=state,
    )
    return {"toc": response.content}
# Assemble the table-of-contents workflow.
agent_state = StateGraph(TocState)

# Nodes: split the text, pick the next chunk, run the model on that chunk.
agent_state.add_node("setup_chunk", build_chunks)
agent_state.add_node("get_chunk", setup_next_chunk)
agent_state.add_node("agent", call_model)

# Edges: after splitting, and again after every model call, chunk_condition
# routes either to "get_chunk" (chunks remain) or to END (nothing left).
agent_state.add_edge(START, "setup_chunk")
agent_state.add_conditional_edges("setup_chunk", chunk_condition)
agent_state.add_edge("get_chunk", "agent")
agent_state.add_conditional_edges("agent", chunk_condition)

graph = agent_state.compile()

View file

@ -0,0 +1,24 @@
# SYSTEM ROLE
You are a content analysis assistant that reads through documents and provides a Table of Contents (ToC) to help users identify what the document covers more easily.
Your ToC should capture all major topics and transitions in the content and should mention them in the order they appear.
# TASK
Analyze the provided content and create a Table of Contents that:
- Captures the core topics included in the text
- Gives a small description of what is covered
# INSTRUCTIONS FOR LARGE DOCUMENTS
If you see a PREVIOUS TOC section below, it means that this request is a continuation of a previous request, most likely to handle context length issues.
Every time, you should replace the previous toc with the new one, and append the new content to the previous content.
{% if toc %}
# PREVIOUS TOC
{{toc}}
{% endif %}
# CONTENT
{{content}}

View file

@ -3,7 +3,7 @@ from langchain_core.runnables import RunnableConfig
from open_notebook.domain import Note, Source
from open_notebook.graphs.chat import graph as chat_graph
from open_notebook.utils import token_cost, token_count
from open_notebook.utils import token_count
# todo: build a smarter, more robust context manager function
@ -56,11 +56,11 @@ def execute_chat(txt_input, session_id):
# it would be nice to have a total token count in the admin somewhere
def chat_sidebar(session_id):
context = build_context(session_id=session_id)
tokens = token_count(str(context))
cost = token_cost(tokens)
tokens = token_count(str(context) + str(st.session_state[session_id]["messages"]))
with st.container(border=True):
request = st.chat_input("Enter your question")
st.caption(f"Total tokens: {tokens}, cost: ${cost:.4f}")
# removing for now since it's not multi-model capable right now
st.caption(f"Total tokens: {tokens}")
if request:
response = execute_chat(txt_input=request, session_id=session_id)
st.session_state[session_id]["messages"] = response["messages"]

View file

@ -132,7 +132,7 @@ def add_source(session_id):
source.save()
source.add_to_notebook(st.session_state[session_id]["notebook"].id)
st.write("Summarizing...")
source.summarize()
source.generate_toc_and_title()
st.rerun()