better title acquisition
This commit is contained in:
parent
356b2a5434
commit
8a5803a21a
3 changed files with 36 additions and 13 deletions
|
|
@ -90,7 +90,6 @@ class ObjectModel(BaseModel):
|
|||
|
||||
def _prepare_save_data(self) -> Dict[str, Any]:
|
||||
data = self.model_dump()
|
||||
logger.debug(f"Preparing data for save: {data}")
|
||||
del data["created"]
|
||||
del data["updated"]
|
||||
return {key: value for key, value in data.items() if value is not None}
|
||||
|
|
@ -306,21 +305,22 @@ class Source(ObjectModel):
|
|||
logger.error(f"Error adding insight to source {self.id}: {str(e)}")
|
||||
raise DatabaseOperationError(e)
|
||||
|
||||
# todo: move this to content processing pipeline as a major graph
|
||||
def generate_toc_and_title(self) -> "Source":
|
||||
try:
|
||||
config = RunnableConfig(configurable=dict(thread_id=self.id))
|
||||
result = toc_graph.invoke({"content": self.full_text}, config=config)
|
||||
logger.warning(result["toc"])
|
||||
self.add_insight("Table of Contents", surreal_clean(result["toc"]))
|
||||
transformations = [
|
||||
"Based on the Table of Contents below, please provide a Title for this content, with max 15 words"
|
||||
]
|
||||
output = pattern_graph.invoke(
|
||||
dict(content_stack=[result["toc"]], transformations=transformations)
|
||||
)
|
||||
logger.warning(output["output"])
|
||||
self.title = surreal_clean(output["output"])
|
||||
self.save()
|
||||
if not self.title:
|
||||
transformations = [
|
||||
"Based on the Table of Contents below, please provide a Title for this content, with max 15 words"
|
||||
]
|
||||
output = pattern_graph.invoke(
|
||||
dict(content_stack=[result["toc"]], transformations=transformations)
|
||||
)
|
||||
logger.warning(output["output"])
|
||||
self.title = surreal_clean(output["output"])
|
||||
self.save()
|
||||
return self
|
||||
except Exception as e:
|
||||
logger.error(f"Error summarizing source {self.id}: {str(e)}")
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@ import fitz # type: ignore
|
|||
import magic
|
||||
import requests # type: ignore
|
||||
from langgraph.graph import END, START, StateGraph
|
||||
from loguru import logger
|
||||
from typing_extensions import TypedDict
|
||||
from youtube_transcript_api import YouTubeTranscriptApi # type: ignore
|
||||
from youtube_transcript_api.formatters import TextFormatter # type: ignore
|
||||
|
|
@ -13,6 +14,7 @@ class SourceState(TypedDict):
|
|||
content: str
|
||||
file_path: str
|
||||
url: str
|
||||
title: str
|
||||
source_type: str
|
||||
identified_type: str
|
||||
identified_provider: str
|
||||
|
|
@ -97,7 +99,26 @@ def extract_url(state: SourceState):
|
|||
Get the content of a URL
|
||||
"""
|
||||
response = requests.get(f"https://r.jina.ai/{state.get('url')}")
|
||||
return {"content": response.text}
|
||||
text = response.text
|
||||
if text.startswith("Title:") and "\n" in text:
|
||||
title_end = text.index("\n")
|
||||
title = text[6:title_end].strip()
|
||||
logger.debug(f"Content has title - {title}")
|
||||
logger.debug(text[:100])
|
||||
content = text[title_end + 1 :].strip()
|
||||
return {"title": title, "content": content}
|
||||
else:
|
||||
logger.debug("Content does not have URL")
|
||||
return {"content": text}
|
||||
|
||||
|
||||
def _get_title(url):
    """
    Return the title found for ``url``, or ``None`` when there is none.

    Delegates the fetch to ``extract_url`` and keeps only the optional
    ``"title"`` entry of the dict it returns; the content is discarded.
    """
    response = extract_url(dict(url=url))
    # extract_url only includes a "title" key when it managed to parse one,
    # so fall back to None explicitly instead of via an implicit return.
    return response.get("title")
|
||||
|
||||
|
||||
def extract_txt(state: SourceState):
|
||||
|
|
@ -166,7 +187,8 @@ def extract_youtube_transcript(state: SourceState):
|
|||
_extract_youtube_id(state.get("url")), languages=["pt", "en"]
|
||||
)
|
||||
formatter = TextFormatter()
|
||||
return {"content": formatter.format_transcript(transcript)}
|
||||
title = _get_title(state.get("url"))
|
||||
return {"content": formatter.format_transcript(transcript), "title": title}
|
||||
|
||||
|
||||
def should_continue(data: SourceState):
|
||||
|
|
|
|||
|
|
@ -128,6 +128,7 @@ def add_source(session_id):
|
|||
source = Source(
|
||||
asset=Asset(url=req.get("url"), file_path=req.get("file_path")),
|
||||
full_text=surreal_clean(result["content"]),
|
||||
title=result.get("title"),
|
||||
)
|
||||
source.save()
|
||||
source.add_to_notebook(st.session_state[session_id]["notebook"].id)
|
||||
|
|
|
|||
Loading…
Reference in a new issue