process table of contents for all sources
This commit is contained in:
parent
795fd3bb9d
commit
9b9303d52f
5 changed files with 121 additions and 14 deletions
|
|
@ -11,7 +11,8 @@ from open_notebook.exceptions import (
|
|||
InvalidInputError,
|
||||
NotFoundError,
|
||||
)
|
||||
from open_notebook.graphs.summary import graph as summarizer
|
||||
from open_notebook.graphs.multipattern import graph as pattern_graph
|
||||
from open_notebook.graphs.recursive_toc import graph as toc_graph
|
||||
from open_notebook.repository import (
|
||||
repo_create,
|
||||
repo_delete,
|
||||
|
|
@ -239,8 +240,7 @@ class Source(ObjectModel):
|
|||
|
||||
def vectorize(self) -> None:
|
||||
try:
|
||||
full_text = self.full_text
|
||||
if not full_text:
|
||||
if not self.full_text:
|
||||
return
|
||||
chunks = split_text(
|
||||
self.full_text,
|
||||
|
|
@ -306,15 +306,20 @@ class Source(ObjectModel):
|
|||
logger.error(f"Error adding insight to source {self.id}: {str(e)}")
|
||||
raise DatabaseOperationError(e)
|
||||
|
||||
def summarize(self) -> "Source":
|
||||
def generate_toc_and_title(self) -> "Source":
|
||||
try:
|
||||
config = RunnableConfig(configurable=dict(thread_id=self.id))
|
||||
result = summarizer.invoke({"content": self.full_text}, config=config)[
|
||||
"output"
|
||||
result = toc_graph.invoke({"content": self.full_text}, config=config)
|
||||
logger.warning(result["toc"])
|
||||
self.add_insight("Table of Contents", surreal_clean(result["toc"]))
|
||||
transformations = [
|
||||
"Based on the Table of Contents below, please provide a Title for this content, with max 15 words"
|
||||
]
|
||||
self.add_insight("summary", surreal_clean(result.summary))
|
||||
self.title = surreal_clean(result.title)
|
||||
self.topics = result.topics
|
||||
output = pattern_graph.invoke(
|
||||
dict(content_stack=[result["toc"]], transformations=transformations)
|
||||
)
|
||||
logger.warning(output["output"])
|
||||
self.title = surreal_clean(output["output"])
|
||||
self.save()
|
||||
return self
|
||||
except Exception as e:
|
||||
|
|
|
|||
78
open_notebook/graphs/recursive_toc.py
Normal file
78
open_notebook/graphs/recursive_toc.py
Normal file
|
|
@ -0,0 +1,78 @@
|
|||
import os
|
||||
from typing import List, Literal
|
||||
|
||||
from langchain_core.runnables import (
|
||||
RunnableConfig,
|
||||
)
|
||||
from langgraph.graph import END, START, StateGraph
|
||||
from typing_extensions import TypedDict
|
||||
|
||||
from open_notebook.graphs.utils import run_pattern
|
||||
from open_notebook.utils import split_text
|
||||
|
||||
|
||||
class TocState(TypedDict):
    """State carried between the nodes of the table-of-contents graph."""

    # Pieces of the document still waiting to be processed; filled by
    # build_chunks and consumed one at a time by setup_next_chunk.
    chunks: List[str]
    # Text currently being processed: the full input when the graph starts,
    # then the chunk most recently taken from ``chunks``.
    content: str
    # Table of contents text written by call_model.
    toc: str
|
||||
|
||||
|
||||
def build_chunks(state: TocState) -> dict:
    """
    Break the incoming document into chunks for sequential processing.

    The chunk size and overlap passed to ``split_text`` are read from the
    SUMMARY_CHUNK_SIZE and SUMMARY_CHUNK_OVERLAP environment variables,
    falling back to 200000 and 1000 when they are not set.

    Returns a state update holding the resulting pieces under "chunks".
    """
    text = state["content"]
    chunk_size = int(os.environ.get("SUMMARY_CHUNK_SIZE", 200000))
    chunk_overlap = int(os.environ.get("SUMMARY_CHUNK_OVERLAP", 1000))
    pieces = split_text(text, chunk=chunk_size, overlap=chunk_overlap)
    return {"chunks": pieces}
|
||||
|
||||
|
||||
def setup_next_chunk(state: TocState) -> dict:
    """
    Move the next pending chunk into the processing area.

    Takes the first entry of ``state["chunks"]`` and returns a state update
    in which "content" is that entry and "chunks" is the remainder of the
    queue.

    Raises:
        IndexError: if there are no chunks left to process.
    """
    pending = state["chunks"]
    # Build fresh values instead of popping from / assigning into ``state``:
    # the node communicates through its return value, and mutating the input
    # in place would silently alter an object the caller still holds.
    return {"chunks": pending[1:], "content": pending[0]}
|
||||
|
||||
|
||||
def chunk_condition(state: TocState) -> Literal["get_chunk", END]:  # type: ignore
    """
    Decide where the graph goes next based on the remaining chunks.

    Returns "get_chunk" while at least one chunk is still queued,
    otherwise END.
    """
    remaining = len(state["chunks"])
    return END if remaining == 0 else "get_chunk"
|
||||
|
||||
|
||||
def call_model(state: TocState, config: RunnableConfig) -> dict:
    """
    Run the "recursive_toc" pattern on the current state.

    The model name is taken from ``config["configurable"]["model_name"]``
    when that key is present; otherwise the SUMMARIZATION_MODEL environment
    variable is used.

    Returns a state update with the response's ``content`` under "toc".
    """
    configurable = config.get("configurable", {})
    fallback_model = os.environ.get("SUMMARIZATION_MODEL")
    model_name = configurable.get("model_name", fallback_model)
    response = run_pattern(
        pattern_name="recursive_toc",
        model_name=model_name,
        state=state,
    )
    return {"toc": response.content}
|
||||
|
||||
|
||||
# Assemble the ToC graph: split the document once, then cycle
# get_chunk -> agent for as long as chunk_condition finds chunks left.
agent_state = StateGraph(TocState)

# Nodes.
agent_state.add_node("setup_chunk", build_chunks)
agent_state.add_node("get_chunk", setup_next_chunk)
agent_state.add_node("agent", call_model)

# Unconditional edges.
agent_state.add_edge(START, "setup_chunk")
agent_state.add_edge("get_chunk", "agent")

# Both decision points share the same check: fetch another chunk or stop.
agent_state.add_conditional_edges("setup_chunk", chunk_condition)
agent_state.add_conditional_edges("agent", chunk_condition)

graph = agent_state.compile()
|
||||
24
prompts/recursive_toc.jinja
Normal file
24
prompts/recursive_toc.jinja
Normal file
|
|
@ -0,0 +1,24 @@
|
|||
|
||||
# SYSTEM ROLE
|
||||
You are a content analysis assistant that reads through documents and provides a Table of Contents (ToC) to help users identify what the document covers more easily.
|
||||
Your ToC should capture all major topics and transitions in the content and should mention them in the order they appear.
|
||||
|
||||
# TASK
|
||||
Analyze the provided content and create a Table of Contents:
|
||||
- Captures the core topics included in the text
|
||||
- Gives a small description of what is covered
|
||||
|
||||
# INSTRUCTIONS FOR LARGE DOCUMENTS
|
||||
|
||||
If you see a PREVIOUS TOC section below, it means that this request is a continuation of a previous request, most likely to handle context length issues.
|
||||
Every time, you should replace the previous toc with the new one, and append the new content to the previous content.
|
||||
|
||||
{% if toc %}
|
||||
# PREVIOUS TOC
|
||||
|
||||
{{toc}}
|
||||
{% endif %}
|
||||
|
||||
# CONTENT
|
||||
|
||||
{{content}}
|
||||
|
|
@ -3,7 +3,7 @@ from langchain_core.runnables import RunnableConfig
|
|||
|
||||
from open_notebook.domain import Note, Source
|
||||
from open_notebook.graphs.chat import graph as chat_graph
|
||||
from open_notebook.utils import token_cost, token_count
|
||||
from open_notebook.utils import token_count
|
||||
|
||||
|
||||
# todo: build a smarter, more robust context manager function
|
||||
|
|
@ -56,11 +56,11 @@ def execute_chat(txt_input, session_id):
|
|||
# it would be nice to have a total token count somewhere in the admin
|
||||
def chat_sidebar(session_id):
|
||||
context = build_context(session_id=session_id)
|
||||
tokens = token_count(str(context))
|
||||
cost = token_cost(tokens)
|
||||
tokens = token_count(str(context) + str(st.session_state[session_id]["messages"]))
|
||||
with st.container(border=True):
|
||||
request = st.chat_input("Enter your question")
|
||||
st.caption(f"Total tokens: {tokens}, cost: ${cost:.4f}")
|
||||
# removing for now since it's not multi-model capable right now
|
||||
st.caption(f"Total tokens: {tokens}")
|
||||
if request:
|
||||
response = execute_chat(txt_input=request, session_id=session_id)
|
||||
st.session_state[session_id]["messages"] = response["messages"]
|
||||
|
|
|
|||
|
|
@ -132,7 +132,7 @@ def add_source(session_id):
|
|||
source.save()
|
||||
source.add_to_notebook(st.session_state[session_id]["notebook"].id)
|
||||
st.write("Summarizing...")
|
||||
source.summarize()
|
||||
source.generate_toc_and_title()
|
||||
|
||||
st.rerun()
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue