separate source and content graph
This commit is contained in:
parent
b42a95b35f
commit
2e2a4947b3
12 changed files with 167 additions and 56 deletions
19
open_notebook/domain/transformation.py
Normal file
19
open_notebook/domain/transformation.py
Normal file
|
|
@ -0,0 +1,19 @@
|
|||
from typing import ClassVar, List, Optional
|
||||
|
||||
import yaml
|
||||
from pydantic import Field
|
||||
|
||||
from open_notebook.domain.base import RecordModel
|
||||
|
||||
|
||||
class Transformation:
    """Accessor for the transformation definitions kept in ``transformations.yaml``."""

    @classmethod
    def get_all(cls):
        """Load and return the parsed contents of ``transformations.yaml``.

        The path is relative, so the file is resolved against the current
        working directory of the process.

        Returns:
            Whatever ``yaml.safe_load`` produces for the document. Callers in
            this project index the result with ``"source_insights"``.

        Raises:
            OSError: if the file cannot be opened (e.g. it does not exist).
        """
        # Decode explicitly as UTF-8 so parsing does not depend on the
        # platform's default locale encoding.
        with open("transformations.yaml", "r", encoding="utf-8") as file:
            return yaml.safe_load(file)
|
||||
|
||||
|
||||
class DefaultTransformations(RecordModel):
    """Record model listing the transformations selected by default."""

    # Identifier string for this record; presumably the key RecordModel uses
    # to load/store it -- confirm against RecordModel.
    record_id: ClassVar[str] = "open_notebook:default_transformations"

    # Names of the default source-insight transformations. Each instance
    # gets its own fresh empty list when no value is supplied.
    source_insights: Optional[List[str]] = Field(default_factory=list)
|
||||
|
|
@ -14,14 +14,14 @@ from open_notebook.graphs.content_processing.pdf import (
|
|||
SUPPORTED_FITZ_TYPES,
|
||||
extract_pdf,
|
||||
)
|
||||
from open_notebook.graphs.content_processing.state import SourceState
|
||||
from open_notebook.graphs.content_processing.state import ContentState
|
||||
from open_notebook.graphs.content_processing.text import extract_txt
|
||||
from open_notebook.graphs.content_processing.url import extract_url, url_provider
|
||||
from open_notebook.graphs.content_processing.video import extract_best_audio_from_video
|
||||
from open_notebook.graphs.content_processing.youtube import extract_youtube_transcript
|
||||
|
||||
|
||||
def source_identification(state: SourceState):
|
||||
def source_identification(state: ContentState):
|
||||
"""
|
||||
Identify the content source based on parameters
|
||||
"""
|
||||
|
|
@ -37,7 +37,7 @@ def source_identification(state: SourceState):
|
|||
return {"source_type": doc_type}
|
||||
|
||||
|
||||
def file_type(state: SourceState):
|
||||
def file_type(state: ContentState):
|
||||
"""
|
||||
Identify the file using python-magic
|
||||
"""
|
||||
|
|
@ -45,10 +45,11 @@ def file_type(state: SourceState):
|
|||
file_path = state.get("file_path")
|
||||
if file_path is not None:
|
||||
return_dict["identified_type"] = magic.from_file(file_path, mime=True)
|
||||
return_dict["title"] = os.path.basename(file_path)
|
||||
return return_dict
|
||||
|
||||
|
||||
def file_type_edge(data: SourceState):
|
||||
def file_type_edge(data: ContentState):
|
||||
assert data.get("identified_type"), "Type not identified"
|
||||
identified_type = data["identified_type"]
|
||||
|
||||
|
|
@ -68,7 +69,7 @@ def file_type_edge(data: SourceState):
|
|||
)
|
||||
|
||||
|
||||
def delete_file(data: SourceState):
|
||||
def delete_file(data: ContentState):
|
||||
if data.get("delete_source"):
|
||||
logger.debug(f"Deleting file: {data.get('file_path')}")
|
||||
file_path = data.get("file_path")
|
||||
|
|
@ -82,7 +83,7 @@ def delete_file(data: SourceState):
|
|||
logger.debug("Not deleting file")
|
||||
|
||||
|
||||
workflow = StateGraph(SourceState)
|
||||
workflow = StateGraph(ContentState)
|
||||
workflow.add_node("source", source_identification)
|
||||
workflow.add_node("url_provider", url_provider)
|
||||
workflow.add_node("file_type", file_type)
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@ from loguru import logger
|
|||
from pydub import AudioSegment
|
||||
|
||||
from open_notebook.domain.models import model_manager
|
||||
from open_notebook.graphs.content_processing.state import SourceState
|
||||
from open_notebook.graphs.content_processing.state import ContentState
|
||||
|
||||
# todo: remove reference to model_manager
|
||||
# future: parallelize the transcription process
|
||||
|
|
@ -72,7 +72,7 @@ def split_audio(input_file, segment_length_minutes=15, output_prefix=None):
|
|||
return output_files
|
||||
|
||||
|
||||
def extract_audio(data: SourceState):
|
||||
def extract_audio(data: ContentState):
|
||||
SPEECH_TO_TEXT_MODEL = model_manager.speech_to_text
|
||||
|
||||
input_audio_path = data.get("file_path")
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@ from loguru import logger
|
|||
from openpyxl import load_workbook
|
||||
from pptx import Presentation
|
||||
|
||||
from open_notebook.graphs.content_processing.state import SourceState
|
||||
from open_notebook.graphs.content_processing.state import ContentState
|
||||
|
||||
SUPPORTED_OFFICE_TYPES = [
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
|
|
@ -251,7 +251,7 @@ def get_xlsx_info(file_path):
|
|||
return None
|
||||
|
||||
|
||||
def extract_office_content(state: SourceState):
|
||||
def extract_office_content(state: ContentState):
|
||||
"""Universal function to extract content from Office files"""
|
||||
assert state.get("file_path"), "No file path provided"
|
||||
assert (
|
||||
|
|
|
|||
|
|
@ -4,7 +4,7 @@ import unicodedata
|
|||
import fitz # type: ignore
|
||||
from loguru import logger
|
||||
|
||||
from open_notebook.graphs.content_processing.state import SourceState
|
||||
from open_notebook.graphs.content_processing.state import ContentState
|
||||
|
||||
# todo: find tables - https://pymupdf.readthedocs.io/en/latest/the-basics.html#extracting-tables-from-a-page
|
||||
# todo: what else can we do to make the text more readable?
|
||||
|
|
@ -127,7 +127,7 @@ def _extract_text_from_pdf(pdf_path):
|
|||
doc.close()
|
||||
|
||||
|
||||
def extract_pdf(state: SourceState):
|
||||
def extract_pdf(state: ContentState):
|
||||
"""
|
||||
Parse the text file and print its content.
|
||||
"""
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
from typing_extensions import TypedDict
|
||||
|
||||
|
||||
class SourceState(TypedDict):
|
||||
class ContentState(TypedDict):
|
||||
content: str
|
||||
file_path: str
|
||||
url: str
|
||||
|
|
|
|||
|
|
@ -1,9 +1,9 @@
|
|||
from loguru import logger
|
||||
|
||||
from open_notebook.graphs.content_processing.state import SourceState
|
||||
from open_notebook.graphs.content_processing.state import ContentState
|
||||
|
||||
|
||||
def extract_txt(state: SourceState):
|
||||
def extract_txt(state: ContentState):
|
||||
"""
|
||||
Parse the text file and print its content.
|
||||
"""
|
||||
|
|
|
|||
|
|
@ -5,14 +5,14 @@ import requests # type: ignore
|
|||
from bs4 import BeautifulSoup, Comment
|
||||
from loguru import logger
|
||||
|
||||
from open_notebook.graphs.content_processing.state import SourceState
|
||||
from open_notebook.graphs.content_processing.state import ContentState
|
||||
|
||||
# future: better extraction methods
|
||||
# https://github.com/buriy/python-readability
|
||||
# also try readability: from readability import Document
|
||||
|
||||
|
||||
def url_provider(state: SourceState):
|
||||
def url_provider(state: ContentState):
|
||||
"""
|
||||
Identify the provider
|
||||
"""
|
||||
|
|
@ -173,7 +173,7 @@ def extract_url_jina(url: str):
|
|||
return {"content": text}
|
||||
|
||||
|
||||
def extract_url(state: SourceState):
|
||||
def extract_url(state: ContentState):
|
||||
assert state.get("url"), "No URL provided"
|
||||
url = state["url"]
|
||||
try:
|
||||
|
|
|
|||
|
|
@ -4,7 +4,7 @@ import subprocess
|
|||
|
||||
from loguru import logger
|
||||
|
||||
from open_notebook.graphs.content_processing.state import SourceState
|
||||
from open_notebook.graphs.content_processing.state import ContentState
|
||||
|
||||
|
||||
def extract_audio_from_video(input_file, output_file, stream_index):
|
||||
|
|
@ -102,7 +102,7 @@ def select_best_audio_stream(streams):
|
|||
return max(scored_streams, key=lambda x: x[0])[1]
|
||||
|
||||
|
||||
def extract_best_audio_from_video(data: SourceState):
|
||||
def extract_best_audio_from_video(data: ContentState):
|
||||
"""
|
||||
Main function to extract the best audio stream from a video file
|
||||
"""
|
||||
|
|
|
|||
|
|
@ -9,7 +9,7 @@ from youtube_transcript_api.formatters import TextFormatter # type: ignore
|
|||
|
||||
from open_notebook.config import CONFIG
|
||||
from open_notebook.exceptions import NoTranscriptFound
|
||||
from open_notebook.graphs.content_processing.state import SourceState
|
||||
from open_notebook.graphs.content_processing.state import ContentState
|
||||
|
||||
ssl._create_default_https_context = ssl._create_unverified_context
|
||||
|
||||
|
|
@ -129,7 +129,7 @@ def get_best_transcript(video_id, preferred_langs=["en", "es", "pt"]):
|
|||
return None
|
||||
|
||||
|
||||
def extract_youtube_transcript(state: SourceState):
|
||||
def extract_youtube_transcript(state: ContentState):
|
||||
"""
|
||||
Parse the text file and print its content.
|
||||
"""
|
||||
|
|
|
|||
106
open_notebook/graphs/source.py
Normal file
106
open_notebook/graphs/source.py
Normal file
|
|
@ -0,0 +1,106 @@
|
|||
import operator
|
||||
from typing import List
|
||||
|
||||
from langchain_core.runnables import (
|
||||
RunnableConfig,
|
||||
)
|
||||
from langgraph.graph import END, START, StateGraph
|
||||
from langgraph.types import Send
|
||||
from loguru import logger
|
||||
from typing_extensions import Annotated, TypedDict
|
||||
|
||||
from open_notebook.domain.notebook import Asset, Source
|
||||
from open_notebook.domain.transformation import Transformation
|
||||
from open_notebook.graphs.content_processing import ContentState
|
||||
from open_notebook.graphs.content_processing import graph as content_graph
|
||||
from open_notebook.graphs.multipattern import graph as transform_graph
|
||||
from open_notebook.utils import surreal_clean
|
||||
|
||||
# todo: we can make this more efficient
|
||||
|
||||
|
||||
class SourceState(TypedDict):
    """State passed between the nodes of the source-processing graph."""

    # Input for the content-processing graph; replaced by that graph's
    # output once content_process has run.
    content_state: ContentState

    # Reducer-backed list (updates are combined with operator.add).
    # Callers seed it with the names of the transformations to apply, and
    # transform_content appends {"name": ..., "content": ...} entries to it.
    # NOTE(review): requested names and produced results share this one
    # list -- consider splitting them into separate keys.
    transformations: Annotated[list, operator.add]

    # Notebook the saved source is added to; a falsy value skips that step.
    notebook_id: str

    # The Source created and saved by save_source.
    source: Source
|
||||
|
||||
|
||||
class TransformationState(TypedDict):
    """Payload delivered to a single ``transform_content`` run."""

    # The saved source whose full text is transformed.
    source: Source

    # One transformation definition; transform_content reads its "name"
    # and "patterns" entries.
    transformation: dict
|
||||
|
||||
|
||||
def content_process(state: SourceState):
    """Run the content-processing graph on the incoming content state.

    Returns:
        A state update whose ``content_state`` is the output of
        ``content_graph.invoke``.
    """
    pending = state["content_state"]
    logger.debug("Content processing started for new content")
    processed = content_graph.invoke(pending)
    return {"content_state": processed}
|
||||
|
||||
|
||||
def run_patterns(input_text, patterns):
    """Invoke the transformation graph on ``input_text`` with ``patterns``.

    The text is passed as the single element of ``content_stack``.

    Returns:
        The ``"output"`` entry of the graph's result.
    """
    payload = {"content_stack": [input_text], "patterns": patterns}
    graph_result = transform_graph.invoke(payload)
    return graph_result["output"]
|
||||
|
||||
|
||||
def save_source(state: SourceState):
    """Persist the processed content as a ``Source`` and optionally link it.

    Builds a ``Source`` from the processed ``content_state`` (url/file path
    as its asset, cleaned content as full text, optional title), saves it,
    and adds it to the notebook when a notebook id is present.

    Returns:
        A state update containing the saved ``source``.

    Raises:
        KeyError: if ``content_state`` is missing from the state or has no
            ``"content"`` entry.
    """
    logger.debug("Saving source")
    content_state = state["content_state"]
    asset = Asset(
        url=content_state.get("url"),
        file_path=content_state.get("file_path"),
    )
    source = Source(
        asset=asset,
        full_text=surreal_clean(content_state["content"]),
        title=content_state.get("title"),
    )
    source.save()

    # The notebook link is optional: a missing key is treated the same as
    # an empty/None id instead of raising KeyError.
    notebook_id = state.get("notebook_id")
    if notebook_id:
        logger.debug(f"Adding source to notebook {notebook_id}")
        source.add_to_notebook(notebook_id)
    return {"source": source}
|
||||
|
||||
|
||||
def trigger_transformations(state: SourceState, config: RunnableConfig):
    """Fan out one ``transform_content`` run per requested transformation.

    Args:
        state: Graph state; ``transformations`` holds the requested
            transformation names and ``source`` the saved source.
        config: Runnable configuration; not used by this function.

    Returns:
        A list of ``Send`` objects targeting ``"transform_content"``, one
        per definition under ``"source_insights"`` whose ``"name"`` was
        requested. Empty when nothing was requested or nothing matched.
    """
    # A missing, None or empty request list all mean "nothing to apply".
    requested = state.get("transformations")
    if not requested:
        return []

    available = Transformation.get_all()["source_insights"]
    to_apply = [t for t in available if t["name"] in requested]
    logger.debug(f"Applying transformations {to_apply}")
    return [
        Send(
            "transform_content",
            {"source": state["source"], "transformation": t},
        )
        for t in to_apply
    ]
|
||||
|
||||
|
||||
def transform_content(state: TransformationState):
    """Apply one transformation to the source text and store it as an insight.

    Runs the transformation's ``patterns`` over ``source.full_text``, adds
    the cleaned result to the source as an insight named after the
    transformation, and reports the raw result back to the graph.

    Returns:
        A state update with a one-element ``transformations`` list holding
        the transformation ``name`` and the uncleaned ``content``.
    """
    source = state["source"]
    text = source.full_text
    transformation = state["transformation"]
    logger.debug(f"Applying transformation {transformation['name']}")

    output = run_patterns(text, patterns=transformation["patterns"])
    source.add_insight(transformation["name"], surreal_clean(output))

    entry = {"name": transformation["name"], "content": output}
    return {"transformations": [entry]}
|
||||
|
||||
|
||||
# Graph wiring: process the content, save it as a source, then dispatch
# the selected transformations.
workflow = StateGraph(SourceState)

# Nodes
workflow.add_node("content_process", content_process)
workflow.add_node("save_source", save_source)
workflow.add_node("transform_content", transform_content)

# Edges
workflow.add_edge(START, "content_process")
workflow.add_edge("content_process", "save_source")
# trigger_transformations returns Send objects aimed at "transform_content"
# (or an empty list), so that node is the only declared destination.
workflow.add_conditional_edges(
    "save_source",
    trigger_transformations,
    ["transform_content"],
)
workflow.add_edge("transform_content", END)

source_graph = workflow.compile()
|
||||
|
|
@ -6,36 +6,15 @@ from humanize import naturaltime
|
|||
from loguru import logger
|
||||
|
||||
from open_notebook.config import UPLOADS_FOLDER
|
||||
from open_notebook.domain.notebook import Asset, Source
|
||||
from open_notebook.domain.notebook import Source
|
||||
from open_notebook.domain.transformation import DefaultTransformations, Transformation
|
||||
from open_notebook.exceptions import UnsupportedTypeException
|
||||
from open_notebook.graphs.content_processing import graph
|
||||
from open_notebook.utils import surreal_clean
|
||||
from open_notebook.graphs.source import source_graph
|
||||
from pages.components import source_panel
|
||||
from pages.stream_app.utils import run_patterns
|
||||
|
||||
from .consts import context_icons
|
||||
|
||||
|
||||
# moved it here to replace it with the pipeline on 0.1.0
|
||||
def generate_toc_and_title(source) -> "Source":
|
||||
try:
|
||||
patterns = ["patterns/default/toc"]
|
||||
result = run_patterns(source.full_text, patterns=patterns)
|
||||
source.add_insight("Table of Contents", surreal_clean(result))
|
||||
if not source.title:
|
||||
patterns = [
|
||||
"Based on the Table of Contents below, please provide a Title for this content, with max 15 words"
|
||||
]
|
||||
output = run_patterns(result, patterns=patterns)
|
||||
source.title = surreal_clean(output)
|
||||
source.save()
|
||||
return source
|
||||
except Exception as e:
|
||||
logger.error(f"Error summarizing source {source.id}: {str(e)}")
|
||||
logger.exception(e)
|
||||
raise
|
||||
|
||||
|
||||
@st.dialog("Source", width="large")
def source_panel_dialog(source_id):
    """Render the source panel for ``source_id`` inside a large "Source" dialog."""
    source_panel(source_id, modal=True)
|
||||
|
|
@ -48,6 +27,7 @@ def add_source(notebook_id):
|
|||
source_text = None
|
||||
source_type = st.radio("Type", ["Link", "Upload", "Text"])
|
||||
req = {}
|
||||
transformations = Transformation.get_all()
|
||||
if source_type == "Link":
|
||||
source_link = st.text_input("Link")
|
||||
req["url"] = source_link
|
||||
|
|
@ -58,6 +38,14 @@ def add_source(notebook_id):
|
|||
else:
|
||||
source_text = st.text_area("Text")
|
||||
req["content"] = source_text
|
||||
|
||||
default_transformations = [t for t in DefaultTransformations().source_insights]
|
||||
available_transformations = [t["name"] for t in transformations["source_insights"]]
|
||||
apply_transformations = st.multiselect(
|
||||
"Apply transformations",
|
||||
options=available_transformations,
|
||||
default=default_transformations,
|
||||
)
|
||||
if st.button("Process", key="add_source"):
|
||||
logger.debug("Adding source")
|
||||
with st.status("Processing...", expanded=True):
|
||||
|
|
@ -82,17 +70,14 @@ def add_source(notebook_id):
|
|||
with open(new_path, "wb") as f:
|
||||
f.write(source_file.getbuffer())
|
||||
|
||||
result = graph.invoke(req)
|
||||
st.write("Saving..")
|
||||
source = Source(
|
||||
asset=Asset(url=req.get("url"), file_path=req.get("file_path")),
|
||||
full_text=surreal_clean(result["content"]),
|
||||
title=result.get("title"),
|
||||
st.write("Processing content..")
|
||||
source_graph.invoke(
|
||||
{
|
||||
"content_state": req,
|
||||
"notebook_id": notebook_id,
|
||||
"transformations": apply_transformations,
|
||||
}
|
||||
)
|
||||
source.save()
|
||||
source.add_to_notebook(notebook_id)
|
||||
st.write("Summarizing...")
|
||||
generate_toc_and_title(source)
|
||||
except UnsupportedTypeException as e:
|
||||
st.warning(
|
||||
"This type of content is not supported yet. If you think it should be, let us know on the project Issues's page"
|
||||
|
|
|
|||
Loading…
Reference in a new issue