diff --git a/open_notebook/domain.py b/open_notebook/domain.py
index 4011473..ff0b700 100644
--- a/open_notebook/domain.py
+++ b/open_notebook/domain.py
@@ -90,7 +90,6 @@ class ObjectModel(BaseModel):
 
     def _prepare_save_data(self) -> Dict[str, Any]:
         data = self.model_dump()
-        logger.debug(f"Preparing data for save: {data}")
         del data["created"]
         del data["updated"]
         return {key: value for key, value in data.items() if value is not None}
@@ -306,21 +305,22 @@ class Source(ObjectModel):
             logger.error(f"Error adding insight to source {self.id}: {str(e)}")
             raise DatabaseOperationError(e)
 
+    # todo: move this to content processing pipeline as a major graph
     def generate_toc_and_title(self) -> "Source":
         try:
             config = RunnableConfig(configurable=dict(thread_id=self.id))
             result = toc_graph.invoke({"content": self.full_text}, config=config)
-            logger.warning(result["toc"])
             self.add_insight("Table of Contents", surreal_clean(result["toc"]))
-            transformations = [
-                "Based on the Table of Contents below, please provide a Title for this content, with max 15 words"
-            ]
-            output = pattern_graph.invoke(
-                dict(content_stack=[result["toc"]], transformations=transformations)
-            )
-            logger.warning(output["output"])
-            self.title = surreal_clean(output["output"])
-            self.save()
+            if not self.title:
+                transformations = [
+                    "Based on the Table of Contents below, please provide a Title for this content, with max 15 words"
+                ]
+                output = pattern_graph.invoke(
+                    dict(content_stack=[result["toc"]], transformations=transformations)
+                )
+                logger.debug(output["output"])
+                self.title = surreal_clean(output["output"])
+                self.save()
             return self
         except Exception as e:
             logger.error(f"Error summarizing source {self.id}: {str(e)}")
diff --git a/open_notebook/graphs/content_process.py b/open_notebook/graphs/content_process.py
index 9534db0..7d1eea9 100644
--- a/open_notebook/graphs/content_process.py
+++ b/open_notebook/graphs/content_process.py
@@ -4,6 +4,7 @@ import fitz  # type: ignore
 import magic
 import requests  # type: ignore
 from langgraph.graph import END, START, StateGraph
+from loguru import logger
 from typing_extensions import TypedDict
 from youtube_transcript_api import YouTubeTranscriptApi  # type: ignore
 from youtube_transcript_api.formatters import TextFormatter  # type: ignore
@@ -13,6 +14,7 @@ class SourceState(TypedDict):
     content: str
     file_path: str
     url: str
+    title: str
     source_type: str
     identified_type: str
     identified_provider: str
@@ -97,7 +99,26 @@ def extract_url(state: SourceState):
     Get the content of a URL
     """
     response = requests.get(f"https://r.jina.ai/{state.get('url')}")
-    return {"content": response.text}
+    text = response.text
+    if text.startswith("Title:") and "\n" in text:
+        title_end = text.index("\n")
+        title = text[len("Title:"):title_end].strip()
+        logger.debug(f"Content has title - {title}")
+        logger.debug(text[:100])
+        content = text[title_end + 1 :].strip()
+        return {"title": title, "content": content}
+    else:
+        logger.debug("Content does not have title")
+        return {"content": text}
+
+
+def _get_title(url):
+    """
+    Get the title of the content at a URL (None when no title is found)
+    """
+    response = extract_url(dict(url=url))
+    # dict.get makes the "no title" result an explicit None
+    return response.get("title")
 
 
 def extract_txt(state: SourceState):
@@ -166,7 +187,8 @@ def extract_youtube_transcript(state: SourceState):
         _extract_youtube_id(state.get("url")), languages=["pt", "en"]
     )
     formatter = TextFormatter()
-    return {"content": formatter.format_transcript(transcript)}
+    title = _get_title(state.get("url"))
+    return {"content": formatter.format_transcript(transcript), "title": title}
 
 
 def should_continue(data: SourceState):
diff --git a/stream_app/source.py b/stream_app/source.py
index bd30713..b9ebb4b 100644
--- a/stream_app/source.py
+++ b/stream_app/source.py
@@ -128,6 +128,7 @@ def add_source(session_id):
     source = Source(
         asset=Asset(url=req.get("url"), file_path=req.get("file_path")),
         full_text=surreal_clean(result["content"]),
+        title=result.get("title"),
     )
     source.save()
     source.add_to_notebook(st.session_state[session_id]["notebook"].id)