From 418c67f69fcb3be54b39b55b05ef8150bee7f85f Mon Sep 17 00:00:00 2001
From: LUIS NOVO <lfnovo@gmail.com>
Date: Mon, 4 Nov 2024 09:53:49 -0300
Subject: [PATCH] add search and rag functions in beta

---
 .../graphs/content_processing/__init__.py     |  9 ---
 open_notebook/graphs/rag.py                   | 44 ++++++++++++
 open_notebook/graphs/tools.py                 | 14 ++++
 pages/3_🔍_Ask_and_Search.py                  | 67 +++++++++++++++++++
 pages/3_🔍_Search.py                          | 66 ------------------
 prompts/rag.jinja                             | 59 ++++++++++++++++
 6 files changed, 184 insertions(+), 75 deletions(-)
 create mode 100644 open_notebook/graphs/rag.py
 create mode 100644 pages/3_🔍_Ask_and_Search.py
 delete mode 100644 pages/3_🔍_Search.py
 create mode 100644 prompts/rag.jinja

diff --git a/open_notebook/graphs/content_processing/__init__.py b/open_notebook/graphs/content_processing/__init__.py
index 2c772dc..915da23 100644
--- a/open_notebook/graphs/content_processing/__init__.py
+++ b/open_notebook/graphs/content_processing/__init__.py
@@ -48,15 +48,6 @@ def file_type(state: SourceState):
     return return_dict
 
 
-# def _get_title(url):
-#     """
-#     Get the content of a URL
-#     """
-#     response = extract_url(dict(url=url))
-#     if "title" in response:
-#         return response["title"]
-
-
 def file_type_edge(data: SourceState):
     assert data.get("identified_type"), "Type not identified"
     identified_type = data["identified_type"]
diff --git a/open_notebook/graphs/rag.py b/open_notebook/graphs/rag.py
new file mode 100644
index 0000000..24dc435
--- /dev/null
+++ b/open_notebook/graphs/rag.py
@@ -0,0 +1,44 @@
+from typing import Annotated
+
+from langchain_core.runnables import (
+    RunnableConfig,
+)
+from langgraph.graph import START, StateGraph
+from langgraph.graph.message import add_messages
+from langgraph.prebuilt import ToolNode, tools_condition
+from typing_extensions import TypedDict
+
+from open_notebook.graphs.tools import repository_search
+from open_notebook.graphs.utils import provision_langchain_model
+from open_notebook.prompter import Prompter
+
+tools = [repository_search]  # tools offered to the model; the same list is bound in call_model_with_messages
+tool_node = ToolNode(tools)  # graph node (langgraph prebuilt) that executes the tool calls the model emits
+
+
+class ThreadState(TypedDict):
+    # Conversation so far; add_messages is the reducer LangGraph applies when a node returns "messages".
+    messages: Annotated[list, add_messages]
+    # Currently disabled fields: notebook: Optional[Notebook], context: Optional[str],
+    # context_config: Optional[dict]
+
+
+def call_model_with_messages(state: ThreadState, config: RunnableConfig) -> dict:
+    """Agent node: render the "rag" prompt, call the tool-bound model, return its reply under "messages"."""
+    # Tag the prompt with the "system" role; a bare str in a message list is coerced to a human message.
+    payload = [("system", Prompter(prompt_template="rag").render(data=state))]
+    payload += state.get("messages", [])  # extends the new list only; the state's own list is untouched
+    model = provision_langchain_model(str(payload), config, "tools", max_tokens=2000)
+    return {"messages": model.bind_tools(tools).invoke(payload)}
+
+
+agent_state = StateGraph(ThreadState)  # builder for the agent <-> tools loop
+agent_state.add_node("agent", call_model_with_messages)
+agent_state.add_node("tools", tool_node)
+agent_state.add_edge(START, "agent")  # every run begins with a model turn
+agent_state.add_conditional_edges(
+    "agent",
+    tools_condition,  # langgraph prebuilt router: chooses the hop that follows each model turn
+)
+agent_state.add_edge("tools", "agent")  # tool output always goes back to the model
+graph = agent_state.compile()  # compiled graph; the Ask page imports it as rag_graph
diff --git a/open_notebook/graphs/tools.py b/open_notebook/graphs/tools.py
index 9c3df13..620fac4 100644
--- a/open_notebook/graphs/tools.py
+++ b/open_notebook/graphs/tools.py
@@ -1,7 +1,10 @@
 from datetime import datetime
+from typing import List
 
 from langchain.tools import tool
 
+from open_notebook.domain.notebook import hybrid_search
+
 
 # todo: turn this into a system prompt variable
 @tool
@@ -11,3 +14,14 @@ def get_current_timestamp() -> str:
     Returns the current timestamp in the format YYYYMMDDHHmmss.
     """
     return datetime.now().strftime("%Y%m%d%H%M%S")
+
+
+@tool  # NOTE(review): @tool uses the docstring below as the tool description given to the model; edit it with care
+def repository_search(keyword_searches: List[str], vector_searches: List[str]) -> str:
+    """
+    name: repository_search
+    Makes a search in the content repository for the given query.
+    keyword_searches: List[str] - A list of search terms to search for using keyword search.
+    vector_searches: List[str] - A list of search terms to search for using vector search.
+    """
+    return hybrid_search(keyword_searches, vector_searches, 20)  # NOTE(review): 20 is presumably a result limit, and "-> str" assumes hybrid_search returns text; confirm both
diff --git a/pages/3_🔍_Ask_and_Search.py b/pages/3_🔍_Ask_and_Search.py
new file mode 100644
index 0000000..d70a3e9
--- /dev/null
+++ b/pages/3_🔍_Ask_and_Search.py
@@ -0,0 +1,67 @@
+import streamlit as st
+
+from open_notebook.domain.models import Model
+from open_notebook.domain.notebook import text_search, vector_search
+from open_notebook.graphs.rag import graph as rag_graph
+from pages.stream_app.utils import setup_page
+
+setup_page("🔍 Search")
+
+ask_tab, search_tab = st.tabs(["Ask Your Knowledge Base (beta)", "Search"])
+
+if "search_results" not in st.session_state:  # seed only when the key is missing
+    st.session_state["search_results"] = []  # read back by the Search tab when it lists results
+
+
+def results_card(item):  # render one search hit as an expander titled "[score] title"
+    score = item.get("relevance", item.get("similarity", item.get("score", 0)))  # relevance, else similarity, else score, else 0
+    with st.expander(f"[{score:.2f}] **{item['title']}**"):
+        st.markdown(f"**{item['content']}**")
+        st.write(item["id"])
+        st.write(item["parent_id"])
+
+
+with ask_tab:
+    st.subheader("Ask Your Knowledge Base (beta)")
+    st.caption(
+        "The LLM will answer your query based on the documents in your knowledge base. "
+    )
+    st.warning(
+        "This functionality requires the use of Tools and, at this moment, works well with Open AI and Anthropic models only."
+    )
+    question = st.text_input("Question", "")
+    models = Model.get_models_by_type("language")
+    # NOTE: the selection is not sent to the graph yet; config=dict(configurable=dict(model_id=model.id)) is still disabled.
+    model: Model = st.selectbox("Model", models, format_func=lambda x: x.name)
+    asked = st.button("Ask")
+    if asked and not question.strip():
+        st.warning("Please enter a question first.")  # skip the model call for blank input
+    elif asked:
+        st.write(f"Searching for {question}")
+        # The question goes in as a bare string; the graph's message reducer is relied on to wrap it.
+        rag_results = rag_graph.invoke(dict(messages=[question]))
+        st.markdown(rag_results["messages"][-1].content)
+        with st.expander("Details (for debugging)"):
+            st.json(rag_results)
+
+with search_tab:
+    with st.container(border=True):
+        st.subheader("🔍 Search")
+        st.caption("Search your knowledge base for specific keywords or concepts")
+        search_term = st.text_input("Search", "")
+        search_type = st.radio("Search Type", ["Text Search", "Vector Search"])
+        search_sources = st.checkbox("Search Sources", value=True)
+        search_notes = st.checkbox("Search Notes", value=True)
+        if st.button("Search"):
+            # Both backends are called with the same arguments, so pick one by its radio label.
+            backends = {"Text Search": text_search, "Vector Search": vector_search}
+            run_search = backends.get(search_type)
+            if run_search is not None:
+                st.write(f"Searching for {search_term}")
+                st.session_state["search_results"] = run_search(
+                    search_term, 100, search_sources, search_notes
+                )
+        # Hits are kept in session state, so the stored list is rendered on every run of the page.
+        found = st.session_state["search_results"]
+        for item in found:
+            results_card(item)
diff --git a/pages/3_🔍_Search.py b/pages/3_🔍_Search.py
deleted file mode 100644
index 4438af5..0000000
--- a/pages/3_🔍_Search.py
+++ /dev/null
@@ -1,66 +0,0 @@
-import streamlit as st
-
-from open_notebook.domain.models import model_manager
-from open_notebook.domain.notebook import text_search, vector_search
-from pages.stream_app.note import note_list_item
-from pages.stream_app.source import source_list_item
-from pages.stream_app.utils import setup_page
-
-setup_page("🔍 Search")
-
-EMBEDDING_MODEL = model_manager.embedding_model
-
-# search_tab, ask_tab = st.tabs(["Search", "Ask"])
-# notebooks = Notebook.get_all()
-
-if "search_results" not in st.session_state:
-    st.session_state["search_results"] = []
-
-# with search_tab:
-with st.container(border=True):
-    st.subheader("🔍 Search")
-    st.caption("Search your knowledge base for specific keywords or concepts")
-    search_term = st.text_input("Search", "")
-    search_type = st.radio("Search Type", ["Text Search", "Vector Search"])
-    search_sources = st.checkbox("Search Sources", value=True)
-    search_notes = st.checkbox("Search Notes", value=True)
-    if st.button("Search"):
-        if search_type == "Text Search":
-            st.write(f"Searching for {search_term}")
-            st.session_state["search_results"] = text_search(
-                search_term, 100, search_sources, search_notes
-            )
-        elif search_type == "Vector Search":
-            st.write(f"Searching for {search_term}")
-            embed_query = EMBEDDING_MODEL.embed(search_term)
-            st.session_state["search_results"] = vector_search(
-                embed_query, 100, search_sources, search_notes
-            )
-    for item in st.session_state["search_results"]:
-        score = item.get("relevance", item.get("similarity", 0))
-        if item.get("item_id"):
-            if "source:" in item["item_id"]:
-                source_list_item(item["item_id"], score)
-            elif "note:" in item["item_id"]:
-                note_list_item(item["item_id"], score)
-
-# coming soon
-# with ask_tab:
-#     with st.form(key="ask_form"):
-#         st.subheader("Ask Your Knowledge Base")
-#         st.caption("Let the LLM formulate an answer based on your query")
-#         question = st.text_input("Your question", "")
-
-#         notebooks = st.multiselect(
-#             "Notebooks",
-#             notebooks,
-#             notebooks,
-#             format_func=lambda x: x.name,
-#         )
-#         search_sources = st.multiselect(
-#             "Use Sources",
-#             ["Sources", "Notes"],
-#             ["Sources", "Notes"],
-#         )
-#         if st.form_submit_button("Search"):
-#             st.write(f"Searching for {search_term}")
diff --git a/prompts/rag.jinja b/prompts/rag.jinja
new file mode 100644
index 0000000..3d8d057
--- /dev/null
+++ b/prompts/rag.jinja
@@ -0,0 +1,59 @@
+# SYSTEM ROLE
+You are a cognitive study assistant that helps users research and learn by engaging in focused discussions about documents in their workspace. 
+
+You have access to a search tool that you can use in order to reply to the user query. 
+
+The tool accepts 2 arrays as parameters:
+
+- keyword_searches: List[str] - A list of search terms to search for using keyword search.
+- vector_searches: List[str] - A list of search terms to search for using vector search.
+
+It's very important that your response contains references to the searched documents so the user can follow up and read more about the topic. The way you do that is by adding the id of the specific document in between brackets like this: [document_id].
+
+# EXAMPLE
+
+User: Can you tell me more about the concept of "Deep Learning"?
+
+Assistant: Deep learning is a subset of machine learning in artificial intelligence (AI) that enables networks to learn unsupervised from unstructured or unlabeled data. [note:iuiodadalknda]. It can also be categorized into three main types: supervised, unsupervised, and reinforcement learning. [insight:adadadadadadad].
+
+Please note, "note:iuiodadalknda" and "insight:adadadadadadad" are examples of document IDs with different prefixes. You should not make up document IDs or copy the IDs from this example. You should use the IDs of the documents that you have access to through the search tool.
+
+# IMPORTANT
+
+- Do not make up documents or document ids. Only use the ids of the documents that you have access to through the query you made.
+- The ID is composed of the type of document and a random string, such as "source:randomstring", "note:randomstring", or "insight:randomstring". There are various types of documents, including notes, insights, and sources. **Always use the complete ID exactly as it is provided, including its type prefix. Do not add, remove, or modify any part of the ID.**
+- Do not assume or change the type prefix of any document ID. If a document ID is "note:xyz", use it exactly as "note:xyz". Do not change it to "source:xyz" or any other variation.
+- **Use document IDs exactly as they are returned from the search tool. Do not add any prefixes or modify them in any way.**
+
+
+{# 
+You are a cognitive study assistant designed to help users research and learn by engaging in focused discussions about documents in their workspace. Your primary goal is to provide informative, accurate responses to user queries while properly citing relevant documents from the available search tool.
+
+To answer this question effectively, you have access to a search tool with the following parameters:
+- keyword_searches: List[str] - A list of search terms for keyword search
+- vector_searches: List[str] - A list of search terms for vector search
+
+Follow these steps to formulate your response:
+
+1. Analyze the user's question and determine appropriate search terms.
+2. Use the search tool to find relevant information.
+3. Carefully review the search results, paying close attention to document IDs and content relevance.
+4. Compose a clear, informative response that directly addresses the user's question.
+5. Include relevant document citations using the exact document IDs provided by the search tool.
+6. Review your response for accuracy and relevance before delivering it to the user.
+
+Important guidelines:
+- Always use the complete document ID as provided by the search tool, including its type prefix (e.g., "note:", "insight:", "source:").
+- Do not make up or modify document IDs in any way.
+- Ensure that each citation is directly relevant to the information it supports.
+- Prioritize accuracy and relevance in your search strategy and response composition.
+
+Before composing your final response, wrap your thought process in <thinking> tags to analyze the question, plan your search strategy, and evaluate the search results. This will help ensure that you retrieve the most relevant information and use the correct document IDs in your citations. Include the following steps:
+a. Analyze the question and identify key concepts
+b. Plan search strategy (both keyword and vector searches)
+c. Evaluate search results and note relevant document IDs
+d. Outline the main points for the response
+
+Your final response should be conversational in tone, directly addressing the user's question while seamlessly incorporating document citations. Use square brackets with the full document ID for each citation, like this: [document_id].
+
+Remember, the quality and accuracy of your response, including proper document citations, are crucial for helping the user in their research and learning process. #}
\ No newline at end of file