better title acquisition

This commit is contained in:
LUIS NOVO 2024-10-24 15:53:03 -03:00
parent 356b2a5434
commit 8a5803a21a
3 changed files with 36 additions and 13 deletions

View file

@ -90,7 +90,6 @@ class ObjectModel(BaseModel):
def _prepare_save_data(self) -> Dict[str, Any]:
    """Build the dictionary that is handed to the save routine.

    The model is dumped with ``model_dump()``, the ``created`` and
    ``updated`` entries are removed, and every remaining field whose
    value is ``None`` is dropped.

    Returns:
        The remaining field name -> value pairs.

    Raises:
        KeyError: if the dump has no ``created`` or ``updated`` entry.
    """
    payload = self.model_dump()
    logger.debug(f"Preparing data for save: {payload}")
    # pop() without a default keeps the KeyError that ``del`` would raise
    # when a timestamp entry is unexpectedly missing.
    for timestamp_key in ("created", "updated"):
        payload.pop(timestamp_key)
    return {name: val for name, val in payload.items() if val is not None}
@ -306,21 +305,22 @@ class Source(ObjectModel):
logger.error(f"Error adding insight to source {self.id}: {str(e)}")
raise DatabaseOperationError(e)
# todo: move this to content processing pipeline as a major graph
def generate_toc_and_title(self) -> "Source":
    """Create a "Table of Contents" insight and, if needed, a title.

    ``toc_graph`` is invoked on ``self.full_text`` and its ``"toc"`` output
    is stored as an insight. A title is only generated (via
    ``pattern_graph``, from that table of contents) when the source does
    not already have one, so a title supplied at creation time is never
    overwritten and no second model call is made for it.

    Returns:
        This source instance.

    Any exception raised along the way is logged with the source id.
    """
    try:
        config = RunnableConfig(configurable=dict(thread_id=self.id))
        result = toc_graph.invoke({"content": self.full_text}, config=config)
        toc = result["toc"]
        logger.warning(toc)
        self.add_insight("Table of Contents", surreal_clean(toc))
        # Fall back to a generated title only when none was acquired earlier.
        if not self.title:
            transformations = [
                "Based on the Table of Contents below, please provide a Title for this content, with max 15 words"
            ]
            output = pattern_graph.invoke(
                dict(content_stack=[toc], transformations=transformations)
            )
            logger.warning(output["output"])
            self.title = surreal_clean(output["output"])
            # The title is the only attribute changed here, so saving is
            # only needed on this branch.
            self.save()
        return self
    except Exception as e:
        logger.error(f"Error summarizing source {self.id}: {str(e)}")

View file

@ -4,6 +4,7 @@ import fitz # type: ignore
import magic
import requests # type: ignore
from langgraph.graph import END, START, StateGraph
from loguru import logger
from typing_extensions import TypedDict
from youtube_transcript_api import YouTubeTranscriptApi # type: ignore
from youtube_transcript_api.formatters import TextFormatter # type: ignore
@ -13,6 +14,7 @@ class SourceState(TypedDict):
content: str
file_path: str
url: str
title: str
source_type: str
identified_type: str
identified_provider: str
def extract_url(state: SourceState):
    """
    Get the content of a URL

    The page is requested through the ``https://r.jina.ai/`` endpoint. When
    the response text begins with a ``Title:`` line, that first line is
    split off and returned under ``"title"`` and the rest of the text is
    returned under ``"content"``. Otherwise only ``"content"`` is returned,
    holding the full response text.
    """
    # NOTE(review): no timeout is passed, so a stalled endpoint blocks the
    # caller indefinitely - consider adding one.
    response = requests.get(f"https://r.jina.ai/{state.get('url')}")
    text = response.text
    if text.startswith("Title:") and "\n" in text:
        # Everything up to the first newline is the title line.
        title_line, _, remainder = text.partition("\n")
        title = title_line[len("Title:"):].strip()
        logger.debug(f"Content has title - {title}")
        logger.debug(text[:100])
        return {"title": title, "content": remainder.strip()}
    logger.debug("Content does not have title")
    return {"content": text}
def _get_title(url):
    """
    Get the title of a URL, or None when no title is available

    Delegates to ``extract_url`` and returns the ``"title"`` entry of its
    result. The lookup is best-effort: a failed HTTP request is logged and
    reported as "no title" instead of being raised, so callers that only
    want the title as extra metadata do not lose the content they already
    extracted.
    """
    try:
        response = extract_url(dict(url=url))
    except requests.RequestException as e:
        logger.warning(f"Could not fetch title for {url}: {e}")
        return None
    # extract_url omits the "title" key when the page has no title line.
    return response.get("title")
def extract_txt(state: SourceState):
@ -166,7 +187,8 @@ def extract_youtube_transcript(state: SourceState):
_extract_youtube_id(state.get("url")), languages=["pt", "en"]
)
formatter = TextFormatter()
return {"content": formatter.format_transcript(transcript)}
title = _get_title(state.get("url"))
return {"content": formatter.format_transcript(transcript), "title": title}
def should_continue(data: SourceState):

View file

@ -128,6 +128,7 @@ def add_source(session_id):
source = Source(
asset=Asset(url=req.get("url"), file_path=req.get("file_path")),
full_text=surreal_clean(result["content"]),
title=result.get("title"),
)
source.save()
source.add_to_notebook(st.session_state[session_id]["notebook"].id)