improve citations and add object page

This commit is contained in:
LUIS NOVO 2024-11-05 16:55:59 -03:00
parent 35c68dff11
commit 3ea4e41a78
14 changed files with 362 additions and 138 deletions

View file

@ -1,3 +1,35 @@
import streamlit as st
st.switch_page("pages/2_📒_Notebooks.py")
from open_notebook.domain.base import ObjectModel
from open_notebook.exceptions import NotFoundError
from pages.components import (
note_panel,
source_embedding_panel,
source_insight_panel,
source_panel,
)
from pages.stream_app.utils import setup_page
# Page entry point: show the detail panel for the object named by the
# ``object_id`` query parameter, or go back to the notebooks list.
setup_page("📒 Open Notebook", sidebar_state="collapsed")

# No object requested: nothing to render on this page.
if "object_id" not in st.query_params:
    st.switch_page("pages/2_📒_Notebooks.py")
    st.stop()

object_id = st.query_params["object_id"]

# Look the object up before rendering so an unknown id redirects to the
# notebooks list.
try:
    obj = ObjectModel.get(object_id)
except NotFoundError:
    st.switch_page("pages/2_📒_Notebooks.py")
    st.stop()

# The part of the id before the first ":" selects the panel to render.
obj_type = object_id.split(":")[0]
panel_by_type = {
    "note": note_panel,
    "source": source_panel,
    "source_insight": source_insight_panel,
    "source_embedding": source_embedding_panel,
}
render_panel = panel_by_type.get(obj_type)
if render_panel is not None:
    # Any other object type renders nothing below the page header.
    render_panel(object_id)

View file

@ -1,5 +1,5 @@
from datetime import datetime
from typing import Any, ClassVar, Dict, List, Optional, Type, TypeVar
from typing import Any, ClassVar, Dict, List, Optional, Type, TypeVar, cast
from loguru import logger
from pydantic import BaseModel, ValidationError, field_validator
@ -29,15 +29,26 @@ class ObjectModel(BaseModel):
@classmethod
def get_all(cls: Type[T], order_by=None) -> List[T]:
try:
# If called from a specific subclass, use its table_name
if cls.table_name:
target_class = cls
table_name = cls.table_name
else:
# This path is taken if called directly from ObjectModel
raise InvalidInputError(
"get_all() must be called from a specific model class"
)
if order_by:
order = f" ORDER BY {order_by}"
else:
order = ""
result = repo_query(f"SELECT * FROM {cls.table_name} {order}")
result = repo_query(f"SELECT * FROM {table_name} {order}")
objects = []
for obj in result:
try:
objects.append(cls(**obj))
objects.append(target_class(**obj))
except Exception as e:
logger.critical(f"Error creating object: {str(e)}")
@ -52,15 +63,44 @@ class ObjectModel(BaseModel):
if not id:
raise InvalidInputError("ID cannot be empty")
try:
# Get the table name from the ID (everything before the first colon)
table_name = id.split(":")[0] if ":" in id else id
# If we're calling from a specific subclass and IDs match, use that class
if cls.table_name and cls.table_name == table_name:
target_class: Type[T] = cls
else:
# Otherwise, find the appropriate subclass based on table_name
found_class = cls._get_class_by_table_name(table_name)
if not found_class:
raise InvalidInputError(f"No class found for table {table_name}")
target_class = cast(Type[T], found_class)
result = repo_query(f"SELECT * FROM {id}")
if result:
return cls(**result[0])
return target_class(**result[0])
else:
raise NotFoundError(f"{cls.table_name} with id {id} not found")
raise NotFoundError(f"{table_name} with id {id} not found")
except Exception as e:
logger.error(f"Error fetching {cls.table_name} with id {id}: {str(e)}")
logger.error(f"Error fetching object with id {id}: {str(e)}")
logger.exception(e)
raise NotFoundError(f"{cls.table_name} with id {id} not found")
raise NotFoundError(f"Object with id {id} not found - {str(e)}")
@classmethod
def _get_class_by_table_name(cls, table_name: str) -> Optional[Type["ObjectModel"]]:
    """Return the ObjectModel subclass whose ``table_name`` equals *table_name*.

    Direct and indirect subclasses of ObjectModel are visited in pre-order
    (a subclass, then all of its descendants, then its next sibling) and the
    first match is returned. Returns None when no subclass matches.
    """
    # Explicit stack instead of recursion. Children are pushed in reverse so
    # that they are popped in the order ``__subclasses__()`` lists them.
    pending: List[Type["ObjectModel"]] = list(
        reversed(ObjectModel.__subclasses__())
    )
    while pending:
        candidate = pending.pop()
        if hasattr(candidate, "table_name") and candidate.table_name == table_name:
            return candidate
        pending.extend(reversed(candidate.__subclasses__()))
    return None
def needs_embedding(self) -> bool:
    """Return whether this object needs an embedding.

    This implementation always returns False.
    """
    # NOTE(review): presumably overridden by models that should be embedded —
    # confirm against the subclasses.
    return False

View file

@ -93,10 +93,42 @@ class Asset(BaseModel):
url: Optional[str] = None
class SourceEmbedding(ObjectModel):
    """A record of the ``source_embedding`` table carrying a text ``content``."""

    table_name: ClassVar[str] = "source_embedding"
    content: str

    @property
    def source(self) -> "Source":
        """Fetch the Source this embedding refers to.

        A query is issued on every access, so callers that need several
        fields should keep the returned object in a local variable.

        Raises:
            DatabaseOperationError: if the query fails or its result has no
                usable ``source`` entry. The underlying exception is attached
                as the cause.
        """
        try:
            src = repo_query(f"""
                select source.* from {self.id} fetch source
            """)
            return Source(**src[0]["source"])
        except Exception as e:
            logger.error(f"Error fetching source for embedding {self.id}: {str(e)}")
            logger.exception(e)
            # Chain explicitly so the traceback shows the original failure as
            # the direct cause of the wrapped error.
            raise DatabaseOperationError(e) from e
class SourceInsight(ObjectModel):
    """A record of the ``source_insight`` table: an insight type plus its text."""

    table_name: ClassVar[str] = "source_insight"
    insight_type: str
    content: str

    @property
    def source(self) -> "Source":
        """Fetch the Source this insight refers to.

        A query is issued on every access, so callers that need several
        fields should keep the returned object in a local variable.

        Raises:
            DatabaseOperationError: if the query fails or its result has no
                usable ``source`` entry. The underlying exception is attached
                as the cause.
        """
        try:
            src = repo_query(f"""
                select source.* from {self.id} fetch source
            """)
            return Source(**src[0]["source"])
        except Exception as e:
            logger.error(f"Error fetching source for insight {self.id}: {str(e)}")
            logger.exception(e)
            # Chain explicitly so the traceback shows the original failure as
            # the direct cause of the wrapped error.
            raise DatabaseOperationError(e) from e
class Source(ObjectModel):
table_name: ClassVar[str] = "source"
@ -112,7 +144,7 @@ class Source(ObjectModel):
return dict(
id=self.id,
title=self.title,
insights=self.insights,
insights=[insight.model_dump() for insight in self.insights],
full_text=self.full_text,
)
else:

View file

@ -3,7 +3,7 @@ import streamlit as st
from open_notebook.domain.models import Model
from open_notebook.domain.notebook import text_search, vector_search
from open_notebook.graphs.rag import graph as rag_graph
from pages.stream_app.utils import setup_page
from pages.stream_app.utils import convert_source_references, setup_page
setup_page("🔍 Search")
@ -40,7 +40,7 @@ with ask_tab:
messages=messages
), # config=dict(configurable=dict(model_id=model.id))
)
st.markdown(rag_results["messages"][-1].content)
st.markdown(convert_source_references(rag_results["messages"][-1].content))
with st.expander("Details (for debugging)"):
st.json(rag_results)

View file

@ -0,0 +1,11 @@
from pages.components.note_panel import note_panel
from pages.components.source_embedding_panel import source_embedding_panel
from pages.components.source_insight import source_insight_panel
from pages.components.source_panel import source_panel
__all__ = [
"note_panel",
"source_embedding_panel",
"source_insight_panel",
"source_panel",
]

View file

@ -0,0 +1,30 @@
import streamlit as st
from loguru import logger
from streamlit_monaco import st_monaco # type: ignore
from open_notebook.domain.notebook import Note
def note_panel(note_id, notebook_id=None):
    """Render the preview/edit tabs and the delete button for a stored note.

    Args:
        note_id: Id of the note to load with ``Note.get``.
        notebook_id: Optional notebook to attach the note to on save, used
            only when the note has no id after saving.

    Raises:
        ValueError: if ``Note.get`` returns a falsy value.
    """
    note: Note = Note.get(note_id)
    if not note:
        raise ValueError(f"Note not found {note_id}")

    t_preview, t_edit = st.tabs(["Preview", "Edit"])
    with t_preview:
        st.subheader(note.title)
        st.markdown(note.content)
    with t_edit:
        # The widgets write straight back into the model; nothing is
        # persisted until "Save" is pressed.
        note.title = st.text_input("Title", value=note.title)
        note.content = st_monaco(
            value=note.content, height="600px", language="markdown"
        )
        if st.button("Save", key=f"pn_edit_note_{note.id or 'new'}"):
            logger.debug("Editing note")
            note.save()
            # NOTE(review): the note was loaded by id, so it presumably
            # already has one and this branch may never run — confirm.
            if not note.id and notebook_id:
                note.add_to_notebook(notebook_id)
            st.rerun()
    if st.button("Delete", type="primary", key=f"delete_note_{note.id or 'new'}"):
        logger.debug("Deleting note")
        note.delete()
        st.rerun()

View file

@ -0,0 +1,17 @@
import streamlit as st
from open_notebook.domain.notebook import SourceEmbedding
def source_embedding_panel(source_embedding_id):
    """Render a source embedding: a link to its source, its content, and a delete button.

    Args:
        source_embedding_id: Id of the embedding to load with
            ``SourceEmbedding.get``.

    Raises:
        ValueError: if ``SourceEmbedding.get`` returns a falsy value.
    """
    si: SourceEmbedding = SourceEmbedding.get(source_embedding_id)
    if not si:
        raise ValueError(f"Embedding not found {source_embedding_id}")

    # Read the ``source`` property once and reuse it for both id and title.
    parent_source = si.source
    with st.container(border=True):
        url = f"Navigator?object_id={parent_source.id}"
        st.markdown("**Original Source**")
        # One f-string only: applying "%" to text that already contains the
        # title breaks as soon as the title itself has a "%" in it.
        st.markdown(f"{parent_source.title} [link]({url})")
    st.markdown(si.content)
    if st.button("Delete", type="primary", key=f"delete_embedding_{si.id or 'new'}"):
        si.delete()
        st.rerun()

View file

@ -0,0 +1,18 @@
import streamlit as st
from open_notebook.domain.notebook import SourceInsight
def source_insight_panel(source, notebook_id=None):
    """Render a source insight: its type, a link to its source, its content, and a delete button.

    Args:
        source: Id of the insight to load with ``SourceInsight.get``.
        notebook_id: Accepted but not used by this function.

    Raises:
        ValueError: if ``SourceInsight.get`` returns a falsy value.
    """
    si: SourceInsight = SourceInsight.get(source)
    if not si:
        raise ValueError(f"insight not found {source}")

    st.subheader(si.insight_type)
    # Read the ``source`` property once and reuse it for both id and title.
    parent_source = si.source
    with st.container(border=True):
        url = f"Navigator?object_id={parent_source.id}"
        st.markdown("**Original Source**")
        # One f-string only: applying "%" to text that already contains the
        # title breaks as soon as the title itself has a "%" in it.
        st.markdown(f"{parent_source.title} [link]({url})")
    st.markdown(si.content)
    if st.button("Delete", type="primary", key=f"delete_insight_{si.id or 'new'}"):
        si.delete()
        st.rerun()

View file

@ -0,0 +1,84 @@
import streamlit as st
import streamlit_scrollable_textbox as stx # type: ignore
import yaml
from humanize import naturaltime
from open_notebook.domain.notebook import Source
from open_notebook.utils import surreal_clean
from pages.stream_app.utils import run_patterns
def source_panel(source_id: str, modal=False):
    """Render the full management panel for one source.

    Shows an editable title, a "Process" tab (insights, transformation
    buttons, embedding and deletion controls) and a "Source" tab with the
    full text.

    Args:
        source_id: Id of the source to load with ``Source.get``.
        modal: When truthy, reruns triggered by insight changes use
            ``scope="fragment"`` instead of ``scope="app"``.

    Raises:
        ValueError: if ``Source.get`` returns a falsy value.
    """
    source: Source = Source.get(source_id)
    if not source:
        raise ValueError(f"Source not found: {source_id}")

    # Scope handed to st.rerun after an insight is added or removed.
    rerun_scope = "fragment" if modal else "app"

    # Editing the title saves immediately; an empty title shows a placeholder.
    current_title = source.title or "No Title"
    source.title = st.text_input("Title", value=current_title)
    if source.title != current_title:
        st.toast("Saved new Title")
        source.save()

    process_tab, source_tab = st.tabs(["Process", "Source"])

    with process_tab:
        main_col, actions_col = st.columns([3, 1])

        with main_col:
            title_slot = st.empty()
            if source.title:
                title_slot.subheader(source.title)

            # Describe where the source came from: URL, file, or pasted text.
            asset = source.asset
            if asset and asset.url:
                origin = f"from URL: {asset.url}"
            elif asset and asset.file_path:
                origin = f"from file: {asset.file_path}"
            else:
                origin = "from text"
            st.caption(f"Created {naturaltime(source.created)}, {origin}")

            for insight in source.insights:
                with st.expander(f"**{insight.insight_type}**"):
                    st.markdown(insight.content)
                    delete_clicked = st.button(
                        "Delete", type="primary", key=f"delete_insight_{insight.id}"
                    )
                    if delete_clicked:
                        insight.delete()
                        st.rerun(scope=rerun_scope)

        with actions_col:
            with open("transformations.yaml", "r") as file:
                transformations = yaml.safe_load(file)

            # One button per configured transformation; clicking one runs its
            # patterns over the full text and stores the result as an insight.
            for transformation in transformations["source_insights"]:
                clicked = st.button(
                    transformation["name"], help=transformation["description"]
                )
                if clicked:
                    result = run_patterns(source.full_text, transformation["patterns"])
                    source.add_insight(
                        transformation["insight_type"], surreal_clean(result)
                    )
                    st.rerun(scope=rerun_scope)

            # Disabled once the source already has embedded chunks.
            embed_clicked = st.button(
                "Embed vectors",
                icon="🦾",
                disabled=source.embedded_chunks > 0,
                help="This will generate your embedding vectors on the database for powerful search capabilities",
            )
            if embed_clicked:
                source.vectorize()
                st.success("Embedding complete")

            # Two-step delete: the checkbox reveals the actual delete button.
            chk_delete = st.checkbox(
                "🗑️ Delete source", key=f"delete_source_{source.id}", value=False
            )
            if chk_delete:
                st.warning("Source will be deleted with all its insights and embeddings")
                confirm_delete = st.button(
                    "Delete", type="primary", key=f"bt_delete_source_{source.id}"
                )
                if confirm_delete:
                    source.delete()
                    st.rerun()

    with source_tab:
        st.subheader("Content")
        stx.scrollableTextbox(source.full_text, height=300)

View file

@ -1,12 +1,18 @@
from typing import Union
import humanize
import streamlit as st
from langchain_core.runnables import RunnableConfig
from open_notebook.domain.base import ObjectModel
from open_notebook.domain.notebook import ChatSession, Note, Notebook, Source
from open_notebook.graphs.chat import graph as chat_graph
from open_notebook.plugins.podcasts import PodcastConfig
from open_notebook.utils import token_count
from pages.stream_app.utils import create_session_for_notebook
from pages.stream_app.utils import (
convert_source_references,
create_session_for_notebook,
)
from .note import make_note_from_chat
@ -26,13 +32,7 @@ def build_context(notebook_id):
if "not in" in status:
continue
# todo: there is problably a better way to handle this
if item_type == "note":
item: Note = Note.get(id)
elif item_type == "source":
item: Source = Source.get(id)
else:
continue
item: Union[Note, Source] = ObjectModel.get(id)
if not item:
continue
@ -48,9 +48,10 @@ def build_context(notebook_id):
return st.session_state[notebook_id]["context"]
def execute_chat(txt_input, current_session):
def execute_chat(txt_input, context, current_session):
current_state = st.session_state[current_session.id]
current_state["messages"] += [txt_input]
current_state["context"] = context
result = chat_graph.invoke(
input=current_state,
config=RunnableConfig(configurable={"thread_id": current_session.id}),
@ -146,10 +147,10 @@ def chat_sidebar(current_notebook: Notebook, current_session: ChatSession):
with st.container(border=True):
request = st.chat_input("Enter your question")
# removing for now since it's not multi-model capable right now
st.caption(f"Total tokens: {tokens}")
if request:
response = execute_chat(
txt_input=request,
context=context,
current_session=current_session,
)
st.session_state[current_session.id]["messages"] = response["messages"]
@ -161,7 +162,7 @@ def chat_sidebar(current_notebook: Notebook, current_session: ChatSession):
continue
with st.chat_message(name=msg.type):
st.write(msg.content)
st.markdown(convert_source_references(msg.content))
if msg.type == "ai":
if st.button("💾 New Note", key=f"render_save_{msg.id}"):
make_note_from_chat(

View file

@ -3,11 +3,11 @@ from typing import Optional
import streamlit as st
from humanize import naturaltime
from loguru import logger
from streamlit_monaco import st_monaco # type: ignore
from open_notebook.domain.notebook import Note
from open_notebook.graphs.multipattern import graph as pattern_graph
from open_notebook.utils import surreal_clean
from pages.components import note_panel
from .consts import context_icons
@ -25,29 +25,8 @@ def add_note(notebook_id):
@st.dialog("Add a Source", width="large")
def note_panel(notebook_id=None, note: Optional[Note] = None):
if not note:
note: Note = Note(note_type="human")
t_preview, t_edit = st.tabs(["Preview", "Edit"])
with t_preview:
st.subheader(note.title)
st.markdown(note.content)
with t_edit:
note.title = st.text_input("Title", value=note.title)
note.content = st_monaco(
value=note.content, height="600px", language="markdown"
)
if st.button("Save", key=f"pn_edit_note_{note.id or 'new'}"):
logger.debug("Editing note")
note.save()
if not note.id:
note.add_to_notebook(notebook_id)
st.rerun()
if st.button("Delete", type="primary", key=f"delete_note_{note.id or 'new'}"):
logger.debug("Deleting note")
note.delete()
st.rerun()
def note_panel_dialog(note: Optional[Note] = None, notebook_id=None):
note_panel(note_id=note.id, notebook_id=notebook_id)
def make_note_from_chat(content, notebook_id=None):
@ -88,7 +67,7 @@ def note_card(note, notebook_id):
st.caption(f"Updated: {naturaltime(note.updated)}")
if st.button("Expand", icon="📝", key=f"edit_note_{note.id}"):
note_panel(notebook_id=notebook_id, note=note)
note_panel_dialog(notebook_id=notebook_id, note=note)
st.session_state[notebook_id]["context_config"][note.id] = context_state
@ -105,4 +84,4 @@ def note_list_item(note_id, score=None):
):
st.write(note.content)
if st.button("Edit Note", icon="📝", key=f"x_edit_note_{note.id}"):
note_panel(note=note)
note_panel_dialog(note=note)

View file

@ -2,8 +2,6 @@ import os
from pathlib import Path
import streamlit as st
import streamlit_scrollable_textbox as stx # type: ignore
import yaml
from humanize import naturaltime
from loguru import logger
@ -11,17 +9,13 @@ from open_notebook.config import UPLOADS_FOLDER
from open_notebook.domain.notebook import Asset, Source
from open_notebook.exceptions import UnsupportedTypeException
from open_notebook.graphs.content_processing import graph
from open_notebook.graphs.multipattern import graph as transform_graph
from open_notebook.utils import surreal_clean
from pages.components import source_panel
from pages.stream_app.utils import run_patterns
from .consts import context_icons
def run_patterns(input_text, patterns):
output = transform_graph.invoke(dict(content_stack=[input_text], patterns=patterns))
return output["output"]
# moved it here to replace it with the pipeline on 0.1.0
def generate_toc_and_title(source) -> "Source":
try:
@ -43,80 +37,8 @@ def generate_toc_and_title(source) -> "Source":
@st.dialog("Source", width="large")
def source_panel(source_id):
source: Source = Source.get(source_id)
if not source:
st.error("Source not found")
return
current_title = source.title if source.title else "No Title"
source.title = st.text_input("Title", value=current_title)
if source.title != current_title:
st.toast("Saved new Title")
source.save()
process_tab, source_tab = st.tabs(["Process", "Source"])
with process_tab:
c1, c2 = st.columns([3, 1])
with c1:
title = st.empty()
if source.title:
title.subheader(source.title)
if source.asset.url:
from_src = f"from URL: {source.asset.url}"
elif source.asset.file_path:
from_src = f"from file: {source.asset.file_path}"
else:
from_src = "from text"
st.caption(f"Created {naturaltime(source.created)}, {from_src}")
for insight in source.insights:
with st.expander(f"**{insight.insight_type}**"):
st.markdown(insight.content)
if st.button(
"Delete", type="primary", key=f"delete_insight_{insight.id}"
):
insight.delete()
st.rerun(scope="fragment")
with c2:
with open("transformations.yaml", "r") as file:
transformations = yaml.safe_load(file)
for transformation in transformations["source_insights"]:
if st.button(
transformation["name"], help=transformation["description"]
):
result = run_patterns(
source.full_text, transformation["patterns"]
)
source.add_insight(
transformation["insight_type"], surreal_clean(result)
)
st.rerun(scope="fragment")
if st.button(
"Embed vectors",
icon="🦾",
disabled=source.embedded_chunks > 0,
help="This will generate your embedding vectors on the database for powerful search capabilities",
):
source.vectorize()
st.success("Embedding complete")
chk_delete = st.checkbox(
"🗑️ Delete source", key=f"delete_source_{source.id}", value=False
)
if chk_delete:
st.warning(
"Source will be deleted with all its insights and embeddings"
)
if st.button(
"Delete", type="primary", key=f"bt_delete_source_{source.id}"
):
source.delete()
st.rerun()
with source_tab:
st.subheader("Content")
stx.scrollableTextbox(source.full_text, height=300)
def source_panel_dialog(source_id):
source_panel(source_id)
@st.dialog("Add a Source", width="large")
@ -207,7 +129,7 @@ def source_card(source, notebook_id):
f"Updated: {naturaltime(source.updated)}, **{len(source.insights)}** insights"
)
if st.button("Expand", icon="📝", key=source.id):
source_panel(source.id)
source_panel_dialog(source.id)
st.session_state[notebook_id]["context_config"][source.id] = context_state
@ -226,4 +148,4 @@ def source_list_item(source_id, score=None):
st.markdown(f"**{insight.insight_type}**")
st.write(insight.content)
if st.button("Edit source", icon="📝", key=f"x_edit_source_{source.id}"):
source_panel(source_id=source.id)
source_panel_dialog(source_id=source.id)

View file

@ -1,3 +1,4 @@
import re
from datetime import datetime
from typing import List, Union
@ -8,6 +9,7 @@ from open_notebook.database.migrate import MigrationManager
from open_notebook.domain.models import model_manager
from open_notebook.domain.notebook import ChatSession, Notebook
from open_notebook.graphs.chat import ThreadState, graph
from open_notebook.graphs.multipattern import graph as transform_graph
from open_notebook.utils import (
compare_versions,
get_installed_version,
@ -15,6 +17,11 @@ from open_notebook.utils import (
)
def run_patterns(input_text, patterns):
    """Invoke the transformation graph and return the ``output`` entry of its result.

    The graph receives *input_text* as the only item of ``content_stack``
    together with *patterns*.
    """
    graph_input = dict(content_stack=[input_text], patterns=patterns)
    graph_result = transform_graph.invoke(graph_input)
    return graph_result["output"]
def version_sidebar():
with st.sidebar:
try:
@ -76,8 +83,6 @@ def setup_stream_state(current_notebook: Notebook) -> ChatSession:
logger.debug("Getting last updated session")
chat_session = sessions[0]
logger.debug(f"Chat session: {chat_session}")
if not chat_session or chat_session.id is None:
raise ValueError("Problem acquiring chat session")
# sets the active session for the notebook
@ -163,3 +168,36 @@ def setup_page(title: str, layout="wide", sidebar_state="expanded"):
check_migration()
check_models()
version_sidebar()
def convert_source_references(text):
    """
    Convert bracketed object references into markdown links.

    Recognises ``[source_insight:id]``, ``[note:id]``, ``[source:id]`` and
    ``[source_embedding:id]``, where ``id`` is one or more word characters.
    Each match becomes a markdown link whose label keeps the original
    brackets and whose target is ``/?object_id=<type:id>``. All other text,
    including bracketed text with an unknown type prefix, is left unchanged.

    Args:
        text (str): The input text containing source references

    Returns:
        str: Text with source references converted to markdown links

    Example:
        >>> convert_source_references("Here is a reference [source_insight:abc123]")
        'Here is a reference [[source_insight:abc123]](/?object_id=source_insight:abc123)'
    """
    # Group 1 captures "type:id" without the surrounding brackets.
    pattern = r"\[((?:source_insight|note|source|source_embedding):\w+)\]"
    # The label is "[type:id]" (brackets kept), hence the doubled brackets.
    return re.sub(pattern, r"[[\1]](/?object_id=\1)", text)

View file

@ -5,9 +5,8 @@ You are a cognitive study assistant that helps users research and learn by engag
- Access to project information and selected documents (CONTEXT)
- Can engage in natural dialogue while maintaining academic rigor
# FORMULATE YOUR DATA
- Generate your answer based on the CONTEXT information
- Ensure that your response is accurate and relevant to the user's query
# YOUR OPERATING METHOD
Whenever a user asks you a question, you need to identify the query context and the user intent. The user might be continuing a previous conversation or asking a new question. Looking at the CONTEXT will probably give you a hint of what the user is looking for. Once you identify the user intent, formulate your answer accordingly paying attention to the CITING INSTRUCTIONS below.
{% if notebook %}
# PROJECT INFORMATION
@ -18,5 +17,26 @@ You are a cognitive study assistant that helps users research and learn by engag
{% if context %}
# CONTEXT
The user has selected this context to help you with your response:
{{context}}
{% endif %}
{% endif %}
# CITING INSTRUCTIONS
If your answer is based off of any item in the context, it's very important that your response contains references to the searched documents so the user can follow-up and read more about the topic. The way you do that is by adding the id of the specific document in between brackets like this: [document_id].
## EXAMPLE
User: Can you tell me more about the concept of "Deep Learning"?
Assistant: Deep learning is a subset of machine learning in artificial intelligence (AI) that enables networks to learn unsupervised from unstructured or unlabeled data. [note:iuiodadalknda]. It can also be categorized into three main types: supervised, unsupervised, and reinforcement learning. [insight:adadadadadadad].
Please note, "note:iuiodadalknda" and "insight:adadadadadadad" are examples of document IDs with different prefixes. You should not make up document IDs or copy the IDs from this example. You should use the IDs of the documents that you have access to through the search tool.
## IMPORTANT
- Do not make up documents or document ids. Only use the ids of the documents that you have access to through the query you made.
- The ID is composed of the type of document and a random string, such as "source:randomstring", "note:randomstring", or "insight:randomstring". There are various types of documents, including notes, insights, and sources. **Always use the complete ID exactly as it is provided, including its type prefix. Do not add, remove, or modify any part of the ID.**
- Do not assume or change the type prefix of any document ID. If a document ID is "note:xyz", use it exactly as "note:xyz". Do not change it to "source:xyz" or any other variation.
- **Use document IDs exactly as they are returned from the search tool. Do not add any prefixes or modify them in any way.**