v1 of transformations

2024-10-23 10:59:05 -03:00 · 2024-10-23 10:59:05 -03:00 · 02ff05b6fd
commit 02ff05b6fd
parent e020511876
16 changed files with 389 additions and 40 deletions
--- a/open_notebook/graphs/multipattern.py
+++ b/open_notebook/graphs/multipattern.py
@ -0,0 +1,66 @@
+import operator
+import os
+from typing import List, Literal, Sequence
+
+from langchain_core.runnables import (
+    RunnableConfig,
+)
+from langgraph.graph import END, START, StateGraph
+from loguru import logger
+from typing_extensions import Annotated, TypedDict
+
+from open_notebook.graphs.utils import run_pattern
+
+
+class PatternChainState(TypedDict):  # state carried through the transformation-chain graph
+    content_stack: Annotated[Sequence[str], operator.add]  # items returned by a node are concatenated onto the stack via operator.add
+    transformations: List[str]  # transformations still waiting to run, in order
+    output: str  # content produced by the most recent transformation
+
+
+def call_model(state: dict, config: RunnableConfig) -> dict:
+    model_name = config.get("configurable", {}).get(
+        "model_name", os.environ.get("DEFAULT_MODEL")
+    )
+    transformations = state["transformations"]
+    current_transformation = transformations.pop(0)
+    if current_transformation.startswith("patterns/"):
+        input_args = {"input_text": state["content_stack"][-1]}
+    else:
+        input_args = {
+            "input_text": state["content_stack"][-1],
+            "command": current_transformation,
+        }
+        current_transformation = "patterns/custom"
+
+    logger.warning(f"Processing transformation: {current_transformation}")
+    logger.debug(f"Using input: {input_args}")
+    transformation_result = run_pattern(
+        pattern_name=current_transformation,
+        model_name=model_name,
+        state=input_args,
+    )
+    return {
+        "content_stack": [transformation_result.content],
+        "output": transformation_result.content,
+        "transformations": state["transformations"],
+    }
+
+
+def transform_condition(state: PatternChainState) -> Literal["agent", END]:  # type: ignore
+    """
+    Checks whether there are more chunks to process.
+    """
+    if len(state["transformations"]) > 0:
+        return "agent"
+    return END
+
+
+agent_state = StateGraph(PatternChainState)
+agent_state.add_node("agent", call_model)
+agent_state.add_edge(START, "agent")
+agent_state.add_conditional_edges(
+    "agent",
+    transform_condition,
+)
+graph = agent_state.compile()
--- a/open_notebook/graphs/pattern.py
+++ b/open_notebook/graphs/pattern.py
@ -0,0 +1,35 @@
+import os
+
+from langchain_core.runnables import (
+    RunnableConfig,
+)
+from langgraph.graph import END, START, StateGraph
+from typing_extensions import TypedDict
+
+from open_notebook.graphs.utils import run_pattern
+
+
+class PatternState(TypedDict):  # state for the single-pattern graph
+    input_text: str  # presumably the text rendered into the pattern template; confirm against run_pattern
+    pattern: str  # pattern name, passed to run_pattern as pattern_name
+    output: str  # set by call_model from run_pattern's return value
+
+
+def call_model(state: dict, config: RunnableConfig) -> dict:
+    model_name = config.get("configurable", {}).get(
+        "model_name", os.environ.get("DEFAULT_MODEL")
+    )
+    return {
+        "output": run_pattern(
+            pattern_name=state["pattern"],
+            model_name=model_name,
+            state=state,
+        )
+    }
+
+
+agent_state = StateGraph(PatternState)
+agent_state.add_node("agent", call_model)
+agent_state.add_edge(START, "agent")  # the single "agent" node runs first
+agent_state.add_edge("agent", END)  # and the graph ends right after it
+graph = agent_state.compile()
--- a/open_notebook/graphs/summary.py
+++ b/open_notebook/graphs/summary.py
@ -57,7 +57,6 @@ def chunk_condition(state: SummaryState) -> Literal["get_chunk", END]:  # type:
    return END


-# todo: build a helper method for LLM communication on all graphs
 def call_model(state: SummaryState, config: RunnableConfig) -> dict:
    model_name = config.get("configurable", {}).get(
        "model_name", os.environ.get("SUMMARIZATION_MODEL")
--- a/open_notebook/prompter.py
+++ b/open_notebook/prompter.py
@ -9,7 +9,15 @@ from typing import Any, Optional, Union

 from jinja2 import Environment, FileSystemLoader, Template

-env = Environment(loader=FileSystemLoader(os.environ.get("PROMPT_PATH", "prompts")))
+current_dir = os.path.dirname(os.path.abspath(__file__))  # directory containing this module
+
+project_root = os.path.dirname(current_dir)  # parent of that directory
+
+env = Environment(
+    loader=FileSystemLoader(
+        os.path.join(project_root, os.environ.get("PROMPT_PATH", "prompts"))  # a relative PROMPT_PATH is joined onto project_root rather than the working directory
+    )
+)


@dataclass
--- a/prompts/patterns/analyze_paper.jinja
+++ b/prompts/patterns/analyze_paper.jinja
@ -0,0 +1,42 @@
+{% include 'patterns/common_text.jinja' %}
+
+# IDENTITY and PURPOSE
+
+You are an insightful and analytical reader of academic papers, extracting the key components, significance, and broader implications. Your focus is to uncover the core contributions, practical applications, methodological strengths or weaknesses, and any surprising findings. You are especially attuned to the clarity of arguments, the relevance to existing literature, and potential impacts on both the specific field and broader contexts.
+
+# STEPS
+
+1. **READ AND UNDERSTAND THE PAPER**: Thoroughly read the paper, identifying its main focus, arguments, methods, results, and conclusions.
+
+2. **IDENTIFY CORE ELEMENTS**:
+   - **Purpose**: What is the main goal or research question?
+   - **Contribution**: What new knowledge or innovation does this paper bring to the field?
+   - **Methods**: What methods are used, and are they novel or particularly effective?
+   - **Key Findings**: What are the most critical results, and why do they matter?
+   - **Limitations**: Are there any notable limitations or areas for further research?
+
+3. **SYNTHESIZE THE MAIN POINTS**:
+   - Extract the key elements and organize them into insightful observations.
+   - Highlight the broader impact and potential applications.
+   - Note any aspects that challenge established views or introduce new questions.
+
+# OUTPUT INSTRUCTIONS
+
+- Structure the output as follows: 
+  - **PURPOSE**: A concise summary of the main research question or goal (1-2 sentences).
+  - **CONTRIBUTION**: A bullet list of 2-3 points that describe what the paper adds to the field.
+  - **KEY FINDINGS**: A bullet list of 2-3 points summarizing the critical outcomes of the study.
+  - **IMPLICATIONS**: A bullet list of 2-3 points discussing the significance or potential impact of the findings on the field or broader context.
+  - **LIMITATIONS**: A bullet list of 1-2 points identifying notable limitations or areas for future work.
+
+- **Bullet Points** should be between 15-20 words.
+- Avoid starting each bullet point with the same word to maintain variety.
+- Use clear and concise language that conveys the key ideas effectively.
+- Do not include warnings, disclaimers, or personal opinions.
+- Output only the requested sections with their respective labels.
+
+# INPUT
+
+{{input_text}}
+
+# OUTPUT
--- a/prompts/patterns/cleanup.jinja
+++ b/prompts/patterns/cleanup.jinja
@ -0,0 +1,6 @@
+{% include 'patterns/common_text.jinja' %}
+
+Please clean up the following text, fixing the paragraphs, punctuation, etc.
+If you find any word or name misspellings, feel free to correct them.
+
+{{input_text}}
--- a/prompts/patterns/common_text.jinja
+++ b/prompts/patterns/common_text.jinja
@ -0,0 +1,4 @@
+# ADDITIONAL INSTRUCTIONS
+
+- You are working on my editorial projects. The text below is my own. 
+- Please do not reply with any acknowledgements or greetings, just provide the content requested.
--- a/prompts/patterns/custom.jinja
+++ b/prompts/patterns/custom.jinja
@ -0,0 +1,9 @@
+{% include 'patterns/common_text.jinja' %}
+
+{{command}}
+
+# INPUT
+
+{{input_text}}
+
+# OUTPUT
--- a/prompts/patterns/keyinsights.jinja
+++ b/prompts/patterns/keyinsights.jinja
@ -0,0 +1,30 @@
+
+{% include 'patterns/common_text.jinja' %}
+
+# IDENTITY and PURPOSE
+
+You extract surprising, powerful, and interesting insights from text content. You are interested in insights related to the purpose and meaning of life, human flourishing, the role of technology in the future of humanity, artificial intelligence and its effect on humans, memes, learning, reading, books, continuous improvement, and similar topics.
+You create 15 word bullet points that capture the most important insights from the input.
+Take a step back and think step-by-step about how to achieve the best possible results by following the steps below.
+
+# STEPS
+
+- Extract 20 to 50 of the most surprising, insightful, and/or interesting ideas from the input in a section called IDEAS, and write them on a virtual whiteboard in your mind using 15 word bullets. If there are fewer than 50 then collect all of them. Make sure you extract at least 20.
+
+- From those IDEAS, extract the most powerful and insightful of them and write them in a section called INSIGHTS. Make sure you extract at least 10 and up to 25.
+
+# OUTPUT INSTRUCTIONS
+
+- INSIGHTS are essentially higher-level IDEAS that are more abstracted and wise.
+- Output the INSIGHTS section only.
+- Each bullet should be about 15 words in length.
+- Do not give warnings or notes; only output the requested sections.
+- You use bulleted lists for output, not numbered lists.
+- Do not start items with the same opening words.
+- Ensure you follow ALL these instructions when creating your output.
+
+# INPUT
+
+{{input_text}}
+
+# OUTPUT
--- a/prompts/patterns/makeitdense.jinja
+++ b/prompts/patterns/makeitdense.jinja
@ -0,0 +1,16 @@
+{% include 'patterns/common_text.jinja' %}
+
+# MISSION
+You are a Sparse Priming Representation (SPR) writer. An SPR is a particular kind of use of language for advanced NLP, NLU, and NLG tasks, particularly useful for the latest generation of Large Language Models (LLMs). You will be given information by the USER which you are to render as an SPR.
+
+# THEORY
+LLMs are a kind of deep neural network. They have been demonstrated to embed knowledge, abilities, and concepts, ranging from reasoning to planning, and even to theory of mind. These are called latent abilities and latent content, collectively referred to as latent space. The latent space of an LLM can be activated with the correct series of words as inputs, which will create a useful internal state of the neural network. This is not unlike how the right shorthand cues can prime a human mind to think in a certain way. Like human minds, LLMs are associative, meaning you only need to use the correct associations to "prime" another model to think in the same way.
+
+# METHODOLOGY
+Render the input as a distilled list of succinct statements, assertions, associations, concepts, analogies, and metaphors. The idea is to capture as much, conceptually, as possible but with as few words as possible. Write it in a way that makes sense to you, as the future audience will be another language model, not a human. Use complete sentences.
+
+# INPUT
+
+{{input_text}}
+
+# OUTPUT
--- a/prompts/patterns/mermaid.jinja
+++ b/prompts/patterns/mermaid.jinja
@ -0,0 +1,29 @@
+# IDENTITY and PURPOSE
+
+You are an expert at data and concept visualization and in turning complex ideas into a form that can be visualized using Mermaid (markdown) syntax.
+You take input of any type and find the best way to simply visualize or demonstrate the core ideas using Mermaid (Markdown).
+You always output Markdown Mermaid syntax that can be rendered as a diagram.
+
+# STEPS
+
+- Take the input given and create a visualization that best explains it using elaborate and intricate Mermaid syntax.
+- Ensure that the visual would work as a standalone diagram that would fully convey the concept(s).
+- Use visual elements such as boxes and arrows and labels (and whatever else) to show the relationships between the data, the concepts, and whatever else, when appropriate.
+- Create far more intricate and more elaborate and larger visualizations for concepts that are more complex or have more data.
+- Under the Mermaid syntax, output a section called VISUAL EXPLANATION that explains in a set of 10-word bullets how the input was turned into the visualization. Ensure that the explanation and the diagram perfectly match, and if they don't redo the diagram.
+- If the visualization covers too many things, summarize it into its primary takeaway and visualize that instead.
+- DO NOT COMPLAIN AND GIVE UP. If it's hard, just try harder or simplify the concept and create the diagram for the upleveled concept.
+
+# OUTPUT INSTRUCTIONS
+
+- DO NOT COMPLAIN. Just output the Mermaid syntax.
+- Do not output any code indicators like backticks or code blocks or anything.
+- Ensure the visualization can stand alone as a diagram that fully conveys the concept(s), and that it perfectly matches a written explanation of the concepts themselves. Start over if it can't.
+- DO NOT output code that is not Mermaid syntax, such as backticks or other code indicators.
+- Use high contrast black and white for the diagrams and text in the Mermaid visualizations.
+
+# INPUT
+
+{{input_text}}
+
+# OUTPUT
--- a/prompts/patterns/reflection_questions.jinja
+++ b/prompts/patterns/reflection_questions.jinja
@ -0,0 +1,28 @@
+
+{% include 'patterns/common_text.jinja' %}
+
+# IDENTITY and PURPOSE
+
+You extract deep, thought-provoking, and meaningful reflections from text content. You are especially focused on themes related to the human experience, such as the purpose of life, personal growth, the intersection of technology and humanity, artificial intelligence's societal impact, human potential, collective evolution, and transformative learning. Your reflections aim to provoke new ways of thinking, challenge assumptions, and provide a thoughtful synthesis of the content.
+
+# STEPS
+
+- Extract 3 to 5 of the most profound, thought-provoking, and/or meaningful ideas from the input in a section called REFLECTIONS.
+- Each reflection should aim to explore underlying implications, connections to broader human experiences, or highlight a transformative perspective.
+- Take a step back and consider the deeper significance or questions that arise from the content.
+
+# OUTPUT INSTRUCTIONS
+
+- The output section should be labeled as REFLECTIONS.
+- Each bullet point should be between 20-25 words.
+- Avoid repetition in the phrasing and ensure variety in sentence structure.
+- The reflections should encourage deeper inquiry and provide a synthesis that transcends surface-level observations.
+- Use bullet points, not numbered lists.
+- Every bullet should be formatted as a question that elicits contemplation or a statement that offers a profound insight.
+- Do not give warnings or notes; only output the requested section.
+
+# INPUT
+
+{{input_text}}
+
+# OUTPUT
--- a/prompts/patterns/summarize.jinja
+++ b/prompts/patterns/summarize.jinja
@ -0,0 +1,16 @@
+{% include 'patterns/common_text.jinja' %}
+
+# SYSTEM ROLE
+You are a content summarization assistant that creates dense, information-rich summaries optimized for machine understanding. Your summaries should capture key concepts with minimal words while maintaining complete, clear sentences.
+
+# TASK
+Analyze the provided content and create a summary that:
+- Captures the core concepts and key information
+- Uses clear, direct language
+- Maintains context from any previous summaries
+
+# INPUT
+
+{{input_text}}
+
+# OUTPUT
--- a/prompts/patterns/translate.jinja
+++ b/prompts/patterns/translate.jinja
@ -0,0 +1,6 @@
+{% include 'patterns/common_text.jinja' %}
+
+Please translate the following text to Portuguese:
+
+
+{{input_text}}
--- a/stream_app/source.py
+++ b/stream_app/source.py
@ -2,13 +2,14 @@ from pathlib import Path

 import streamlit as st
 import streamlit_scrollable_textbox as stx  # type: ignore
+import yaml
 from humanize import naturaltime
 from loguru import logger
-from streamlit_tags import st_tags  # type: ignore

 from open_notebook.domain import Asset, Source
 from open_notebook.graphs.content_process import graph
-from open_notebook.utils import token_cost, token_count
+from open_notebook.graphs.multipattern import graph as transform_graph
+from open_notebook.utils import surreal_clean

 from .consts import context_icons

@ -16,50 +17,71 @@ uploads_dir = Path("./.uploads")
 uploads_dir.mkdir(parents=True, exist_ok=True)


+def run_transformations(input_text, transformations):
+    output = transform_graph.invoke(
+        dict(content_stack=[input_text], transformations=transformations)
+    )
+    return output["output"]
+
+
@st.dialog("Source", width="large")
 def source_panel(source_id):
    source: Source = Source.get(source_id)
    if not source:
        st.error("Source not found")
        return
-    title = st.empty()
-    if source.title:
-        title.subheader(source.title)
-    st.caption(f"Created {naturaltime(source.created)}")
-    # st.markdown(f"**URL:** {source.url}, **File:** {source.file_path}")
-    summary = st.empty()
-    for insight in source.insights:
-        summary.write(insight.insight_type)
-        summary.write(insight.content)

-    topics = source.topics or []
-    if len(topics) > 0:
-        st_tags(
-            label="",
-            text="Press enter to add more",
-            value=source.topics,
-            suggestions=source.topics,
-            maxtags=10,
-            key="1",
-        )
+    process_tab, source_tab = st.tabs(["Process", "Source"])  # insights and actions in one tab, the raw text in the other
+    with process_tab:
+        c1, c2 = st.columns([3, 1])  # c1: title, caption and insights; c2: action buttons
+        with c1:
+            title = st.empty()
+            if source.title:
+                title.subheader(source.title)
+            if source.asset.url:
+                from_src = f"from URL: {source.asset.url}"
+            elif source.asset.file_path:
+                from_src = f"from file: {source.asset.file_path}"
+            else:
+                from_src = "from text"
+            st.caption(f"Created {naturaltime(source.created)}, {from_src}")
+            for insight in source.insights:  # one expander per stored insight, each with its own delete button
+                with st.expander(f"**{insight.insight_type}**"):
+                    st.markdown(insight.content)
+                    if st.button("Delete", key=f"delete_insight_{insight.id}"):
+                        insight.delete()
+                        st.rerun(scope="fragment")

-    if st.button("Delete", icon="🗑️"):
-        source.delete()
-        st.rerun()
+        with c2:
+            with open("transformations.yaml", "r") as file:  # NOTE(review): relative path, resolved against the process working directory
+                transformations = yaml.safe_load(file)
+                for transformation in transformations["source_insights"]:  # one button per configured transformation
+                    if st.button(
+                        transformation["name"], help=transformation["description"]
+                    ):
+                        result = run_transformations(
+                            source.full_text, transformation["transformations"]
+                        )
+                        source.add_insight(
+                            transformation["insight_type"], surreal_clean(result)
+                        )
+                        st.rerun(scope="fragment")

-    cost = token_cost(token_count(source.full_text)) * 1.2
-    if st.button(f"Summarize (about ${cost:.4f})", icon="📝"):
-        source.summarize()
-        st.rerun(scope="fragment")
+            if st.button(
+                "Embed vectors",
+                icon="🦾",
+                help="This will generate your embedding vectors on the database for powerful search capabilities",
+            ):
+                source.vectorize()
+                st.success("Embedding complete")

-    cost_embedding = token_cost(token_count(source.full_text), 0.02)
+            if st.button("Delete", icon="🗑️"):  # deletes the whole source, unlike the per-insight buttons above
+                source.delete()
+                st.rerun()

-    if st.button(f"Embed (${cost_embedding:.4f})", icon="📝"):
-        source.vectorize()
-        st.success("Embedding complete")
-
-    st.subheader("Content")
-    stx.scrollableTextbox(source.full_text, height=300)
+    with source_tab:
+        st.subheader("Content")
+        stx.scrollableTextbox(source.full_text, height=300)


@st.dialog("Add a Source", width="large")
@ -105,16 +127,14 @@ def add_source(session_id):
            st.write("Saving..")
            source = Source(
                asset=Asset(url=req.get("url"), file_path=req.get("file_path")),
+                full_text=surreal_clean(result["content"]),
            )
            source.save()
-            source.save_chunks(result["content"])
            source.add_to_notebook(st.session_state[session_id]["notebook"].id)
            st.write("Summarizing...")
            source.summarize()

        st.rerun()
-    # else:
-    #     st.stop()


 def source_card(session_id, source):
--- a/transformations.yaml
+++ b/transformations.yaml
@ -0,0 +1,35 @@
+
+source_insights:
+  - name: "Summarize"
+    insight_type: "Content Summary"
+    description: "Summarize the content"
+    transformations:
+      - patterns/makeitdense
+      - patterns/summarize
+  - name: "Key Insights"
+    insight_type: "Key Insights"
+    description: "Extracts a list of the Key Insights of the content"
+    transformations:
+      - patterns/keyinsights
+  - name: "Make it Dense"
+    insight_type: "Dense Representation"
+    description: "Create a dense representation of the content"
+    transformations:
+      - patterns/makeitdense
+  - name: "Analyze Paper"
+    insight_type: "Paper Analysis"
+    description: "Analyze the paper and provide a quick summary"
+    transformations:
+      - patterns/analyze_paper
+  - name: "Reflection"
+    insight_type: "Reflection Questions"
+    description: "Generates a list of insightful questions to provoke reflection"
+    transformations:
+      - patterns/reflection_questions
+  - name: "Reflection [PT]"
+    insight_type: "Reflection Questions [PT]"
+    description: "Generates a list of insightful questions to provoke reflection"
+    transformations:
+      - patterns/reflection_questions
+      - patterns/translate
+