WIP: podcast feature

This commit is contained in:
LUIS NOVO 2024-10-24 13:25:01 -03:00
parent 10ec265a44
commit 177d2c2b93
4 changed files with 686 additions and 28 deletions

View file

@ -0,0 +1,179 @@
"""
Website Extractor Module
This module is responsible for extracting clean text content from websites using
BeautifulSoup for local HTML parsing instead of the Jina AI API.
"""
import html
import logging
import re
from typing import List
from urllib.parse import urlparse
import requests
from bs4 import BeautifulSoup
from podcastfy.utils.config import load_config
logger = logging.getLogger(__name__)


class WebsiteExtractor:
    """Fetch a web page and reduce it to clean plain text.

    Settings are read from the ``website_extractor`` section of the mapping
    returned by ``load_config()``; every setting has a fallback default.
    """

    def __init__(self):
        """
        Initialize the WebsiteExtractor from the project configuration.
        """
        self.config = load_config()
        self.website_extractor_config = self.config.get("website_extractor", {})
        # Tag names whose elements are dropped before text extraction.
        self.unwanted_tags = self.website_extractor_config.get("unwanted_tags", [])
        self.user_agent = self.website_extractor_config.get("user_agent", "Mozilla/5.0")
        # Passed straight to requests.get(timeout=...).
        self.timeout = self.website_extractor_config.get("timeout", 10)
        # Regex patterns removed from the final text by clean_content().
        self.remove_patterns = self.website_extractor_config.get(
            "markdown_cleaning", {}
        ).get("remove_patterns", [])

    def extract_content(self, url: str) -> str:
        """
        Extract clean text content from a website using BeautifulSoup.

        Args:
            url (str): Website URL.

        Returns:
            str: Extracted clean text content.

        Raises:
            Exception: If there's an error in extracting the content. The
                original error is attached as ``__cause__``.
        """
        try:
            # Normalize the URL
            normalized_url = self.normalize_url(url)
            # Request the webpage
            headers = {"User-Agent": self.user_agent}
            response = requests.get(
                normalized_url, headers=headers, timeout=self.timeout
            )
            response.raise_for_status()  # Raise an exception for bad status codes
            # Parse the page content with BeautifulSoup
            soup = BeautifulSoup(response.text, "html.parser")
            # Remove unwanted elements
            self.remove_unwanted_elements(soup)
            # Extract and clean the text content
            raw_text = soup.get_text(separator="\n")  # Get all text content
            return self.clean_content(raw_text)
        except requests.RequestException as e:
            logger.error(f"Failed to extract content from {url}: {str(e)}")
            # Chain the cause so the network-level traceback is not lost.
            raise Exception(f"Failed to extract content from {url}: {str(e)}") from e
        except Exception as e:
            logger.error(
                f"An unexpected error occurred while extracting content from {url}: {str(e)}"
            )
            raise Exception(
                f"An unexpected error occurred while extracting content from {url}: {str(e)}"
            ) from e

    def normalize_url(self, url: str) -> str:
        """
        Normalize the given URL by adding scheme if missing and ensuring it's a valid URL.

        Args:
            url (str): The URL to normalize.

        Returns:
            str: The normalized URL.

        Raises:
            ValueError: If the URL is invalid after normalization attempts.
        """
        # Pasted URLs often carry stray whitespace.
        url = url.strip()
        # Schemes are case-insensitive; without lower() "HTTP://x" would be
        # turned into "https://HTTP://x".
        if not url.lower().startswith(("http://", "https://")):
            url = "https://" + url
        # Parse the URL
        parsed = urlparse(url)
        # Ensure the URL has a valid scheme and netloc
        if not all([parsed.scheme, parsed.netloc]):
            raise ValueError(f"Invalid URL: {url}")
        return parsed.geturl()

    def remove_unwanted_elements(self, soup: "BeautifulSoup") -> None:
        """
        Remove unwanted elements from the BeautifulSoup object in place.

        Args:
            soup (BeautifulSoup): The BeautifulSoup object to clean.
        """
        for tag in self.unwanted_tags:
            for element in soup.find_all(tag):
                element.decompose()

    def clean_content(self, content: str) -> str:
        """
        Clean the extracted content by removing unnecessary whitespace and applying
        custom cleaning patterns.

        Runs of spaces/tabs collapse to one space, whitespace around line
        breaks is dropped, and three or more consecutive newlines collapse to
        a single blank line, so paragraph breaks survive.

        Args:
            content (str): The content to clean.

        Returns:
            str: Cleaned text content.
        """
        # Decode HTML entities
        cleaned_content = html.unescape(content)
        # Collapse horizontal whitespace only. Using r"\s+" here would also
        # swallow newlines and make the newline clean-up below a no-op.
        cleaned_content = re.sub(r"[^\S\n]+", " ", cleaned_content)
        # Drop the single space left on either side of a line break
        cleaned_content = re.sub(r" ?\n ?", "\n", cleaned_content)
        # Remove extra newlines
        cleaned_content = re.sub(r"\n{3,}", "\n\n", cleaned_content)
        # Apply custom cleaning patterns from config
        for pattern in self.remove_patterns:
            cleaned_content = re.sub(pattern, "", cleaned_content)
        return cleaned_content.strip()
def main(seed: int = 42) -> None:
    """
    Smoke-test WebsiteExtractor against a few sample URLs, logging a preview
    and the total length of each result.

    Args:
        seed (int): Not used by this function.
    """
    logging.basicConfig(level=logging.INFO)
    extractor = WebsiteExtractor()
    divider = "-" * 50
    # One URL without a scheme and one fully qualified URL.
    test_urls: List[str] = [
        "www.souzatharsis.com",
        "https://en.wikipedia.org/wiki/Web_scraping",
    ]
    for url in test_urls:
        try:
            logger.info(f"Extracting content from: {url}")
            content = extractor.extract_content(url)
            # Log only the first 500 characters of the extracted content
            logger.info(
                f"Extracted content (first 500 characters):\n{content[:500]}..."
            )
            logger.info(f"Total length of extracted content: {len(content)} characters")
            logger.info(divider)
        except Exception as e:
            # Keep going so one bad URL does not stop the remaining ones.
            logger.error(f"An error occurred while processing {url}: {str(e)}")


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,307 @@
from typing import ClassVar, List, Literal
from pydantic import Field, field_validator
from open_notebook.domain import ObjectModel
class PodcastEpisode(ObjectModel):
    """A podcast episode record: its name, template, instructions and file path."""

    # presumably the storage table ObjectModel persists to; verify in ObjectModel
    table_name: ClassVar[str] = "podcast_episode"

    name: str
    # presumably the name of the PodcastConfig template used; verify against callers
    template: str
    instructions: str
    file_path: str
class PodcastConfig(ObjectModel):
    """A reusable podcast template: show identity, speaker roles, style lists,
    target length, creativity and text-to-speech settings.

    Raises:
        pydantic.ValidationError: On construction when ``wordcount`` is
            outside 500-6000 or ``creativity`` is outside 0-1 (both inclusive).
    """

    # presumably the storage table ObjectModel persists to; verify in ObjectModel
    table_name: ClassVar[str] = "podcast_config"

    name: str
    podcast_name: str
    podcast_tagline: str
    output_language: str = Field(default="English")
    person1_role: str
    person2_role: str
    conversation_style: List[str]
    engagement_technique: List[str]
    dialogue_structure: List[str]
    # Inclusive 500-6000. The declared bounds now agree with the validator
    # below; previously the Field said (500, 10000) while the validator
    # enforced [500, 6000].
    wordcount: int = Field(ge=500, le=6000)
    creativity: float = Field(ge=0, le=1)
    provider: Literal["openai", "elevenlabs", "edge"] = Field(default="openai")
    voice1: str
    voice2: str
    model: str

    @field_validator("wordcount")
    @classmethod
    def validate_wordcount(cls, value):
        """Reject word counts outside the inclusive 500-6000 range."""
        if not 500 <= value <= 6000:
            raise ValueError("Wordcount must be between 500 and 6000")
        return value

    @field_validator("creativity")
    @classmethod
    def validate_creativity(cls, value):
        """Reject creativity values outside the inclusive 0-1 range."""
        if not 0 <= value <= 1:
            raise ValueError("Creativity must be between 0 and 1")
        return value
# Conversation Styles
# Suggested values for PodcastConfig.conversation_style. The model only
# requires a list of strings, so entries outside this list are still accepted.
conversation_styles = [
    "Analytical",
    "Argumentative",
    "Informative",
    "Humorous",
    "Casual",
    "Formal",
    "Inspirational",
    "Debate-style",
    "Interview-style",
    "Storytelling",
    "Reflective",
    "Narrative",
    "Satirical",
    "Educational",
    "Conversational",
    "Critical",
    "Empathetic",
    "Philosophical",
    "Speculative",
    "Motivational",
    "Fun",
    "Technical",
    "Light-hearted",
    "Serious",
    "Investigative",
    "Debunking",
    "Collaborative",
    "Didactic",
    "Thought-provoking",
    "Controversial",
    "Skeptical",
    "Optimistic",
    "Pessimistic",
    "Objective",
    "Subjective",
    "Sarcastic",
    "Emotional",
    "Exploratory",
    "Friendly",
    "Fast-paced",
    "Slow-paced",
    "Introspective",
    "Open-ended",
    "Affirmative",
    "Dissenting",
]
# Dialogue Structures
# Suggested segment names for PodcastConfig.dialogue_structure. The model only
# requires a list of strings, so entries outside this list are still accepted.
dialogue_structures = [
    "Topic Introduction",
    "Opening Monologue",
    "Guest Introduction",
    "Icebreakers",
    "Historical Context",
    "Defining Terms",
    "Problem Statement",
    "Overview of the Issue",
    "Deep Dive into Subtopics",
    "Pro Arguments",
    "Con Arguments",
    "Cross-examination",
    "Rebuttal",
    "Expert Interviews",
    "Panel Discussion",
    "Case Studies",
    "Myth Busting",
    "Debunking Misconceptions",
    "Audience Questions",
    "Q&A Session",
    "Listener Feedback",
    "Rapid-fire Questions",
    "Summary of Key Points",
    "Recap",
    "Key Takeaways",
    "Actionable Tips",
    "Call to Action",
    "Future Outlook",
    "Teaser for Next Episode",
    "Closing Remarks",
    "Thank You and Credits",
    "Outtakes or Bloopers",
    "Sponsor Messages",
    "Social Media Shout-outs",
    "Resource Recommendations",
    "Feedback Request",
    "Lightning Round",
    "Behind-the-Scenes Insights",
    "Ethical Considerations",
    "Fact-checking Segment",
    "Trending Topics",
    "Closing Inspirational Quote",
    "Final Reflections",
    "Debrief",
    "Farewell Messages",
    "Next Episode Preview",
    "Live Reactions",
    "Call-in Segment",
    "Acknowledgements",
    "Transition Segments",
    "Break Segments",
]
# Podcast Participant Roles
# Suggested values for PodcastConfig.person1_role / person2_role. Both fields
# are plain strings, so roles outside this list are still accepted.
participant_roles = [
    "Main Summarizer",
    "Questioner/Clarifier",
    "Optimist",
    "Skeptic",
    "Specialist",
    "Thesis Presenter",
    "Counterargument Provider",
    "Professor",
    "Student",
    "Moderator",
    "Host",
    "Co-host",
    "Expert Guest",
    "Novice",
    "Devil's Advocate",
    "Analyst",
    "Storyteller",
    "Fact-checker",
    "Comedian",
    "Interviewer",
    "Interviewee",
    "Historian",
    "Visionary",
    "Strategist",
    "Critic",
    "Enthusiast",
    "Mediator",
    "Commentator",
    "Researcher",
    "Reporter",
    "Advocate",
    "Influencer",
    "Observer",
    "Listener",
    "Facilitator",
    "Innovator",
    "Debater",
    "Educator",
    "Motivator",
    "Narrator",
    "Explorer",
    "Opponent",
    "Proponent",
    "Philosopher",
    "Engineer",
    "Doctor",
    "Psychologist",
    "Economist",
    "Politician",
    "Scientist",
    "Entrepreneur",
    "Artist",
    "Author",
    "Journalist",
    "Activist",
    "Challenger",
    "Supporter",
    "Mentor",
    "Mentee",
    "Panelist",
    "Audience Representative",
    "Case Study Presenter",
    "Data Analyst",
    "Ethicist",
    "Cultural Critic",
    "Technologist",
    "Environmentalist",
    "Legal Expert",
    "Healthcare Professional",
    "Financial Advisor",
    "Policy Maker",
    "Sociologist",
    "Anthropologist",
    "Myth Buster",
    "Trend Analyst",
    "Futurist",
    "Negotiator",
    "Community Leader",
    "Voice of Reason",
    "Conflict Resolver",
    "Emotional Support",
    "Pragmatist",
    "Idealist",
    "Realist",
    "Satirist",
    "Story Analyst",
    "Language Expert",
    "Historical Witness",
    "Survivor",
    "Inspirational Figure",
    "Cultural Ambassador",
    "Digital Nomad",
    "Remote Correspondent",
    "Field Reporter",
    "Data Scientist",
    "Gamer",
    "Musician",
    "Filmmaker",
]
# Engagement Techniques
# Suggested values for PodcastConfig.engagement_technique. The model only
# requires a list of strings, so entries outside this list are still accepted.
engagement_techniques = [
    "Rhetorical Questions",
    "Anecdotes",
    "Analogies",
    "Humor",
    "Metaphors",
    "Storytelling",
    "Quizzes",
    "Polls",
    "Contests/Giveaways",
    "Guest Appearances",
    "Sound Effects",
    "Music Interludes",
    "Shout-outs",
    "Interactive Challenges",
    "Personal Testimonials",
    "Quotes",
    "Jokes",
    "Surprise Elements",
    "Emotional Appeals",
    "Provocative Statements",
    "Irony",
    "Sarcasm",
    "Alliteration",
    "Repetition",
    "Foreshadowing",
    "Cliffhangers",
    "Audience Participation",
    "Sensory Descriptions",
    "Visual Aids (if applicable)",
    "Callbacks to Earlier Points",
    "Pop Culture References",
    "Hyperbole",
    "Parables",
    "Thought Experiments",
    "Puzzles and Riddles",
    "Role-playing",
    "Mock Scenarios",
    "Debates",
    "Sound Bites",
    "Catchphrases",
    "Voice Modulation",
    "Interactive Games",
    "Live Demos",
    "Behind-the-Scenes Insights",
    "Vivid Imagery",
    "Statistics and Facts",
    "Open-ended Questions",
    "Challenges to Assumptions",
    "Evoking Curiosity",
    "Memes (if visual components are included)",
    "Surveys",
    "Testimonials",
    "Provocations",
]

150
pages/5_🎙️_Podcasts.py Normal file
View file

@ -0,0 +1,150 @@
import streamlit as st
from streamlit_tags import st_tags
from open_notebook.plugins.podcasts import (
PodcastConfig,
PodcastEpisode,
conversation_styles,
dialogue_structures,
engagement_techniques,
participant_roles,
)
# Page layout: one tab listing saved episodes, one for managing templates.
episodes_tab, templates_tab = st.tabs(["Episodes", "Templates"])

with episodes_tab:
    episodes = PodcastEpisode.get_all()
    if episodes:
        # Dump each saved episode as raw JSON.
        for episode in episodes:
            st.json(episode.model_dump())
    else:
        # An explicit emptiness check is required: a for/else clause runs
        # whenever the loop finishes without `break`, which would show this
        # message even when episodes exist.
        st.write("No episodes yet")
with templates_tab:
    st.subheader("Podcast Templates")
    st.markdown("")

    # --- Create form: gather every PodcastConfig field into a plain dict ---
    with st.expander("**Create new Template**"):
        pd_cfg = {}
        pd_cfg["name"] = st.text_input("Template Name")
        pd_cfg["podcast_name"] = st.text_input("Podcast Name")
        pd_cfg["podcast_tagline"] = st.text_input("Podcast Tagline")
        pd_cfg["output_language"] = st.text_input("Language", value="English")
        pd_cfg["person1_role"] = st.text_input("Person 1 role")
        st.caption(f"Suggestions:{', '.join(participant_roles)}")
        pd_cfg["person2_role"] = st.text_input("Person 2 role")
        # Start with no tags selected, like the other two tag inputs.
        pd_cfg["conversation_style"] = st_tags(
            [], conversation_styles, "Conversation Style"
        )
        st.caption(f"Suggestions:{', '.join(conversation_styles)}")
        pd_cfg["engagement_technique"] = st_tags(
            [], engagement_techniques, "Engagement Techniques"
        )
        st.caption(f"Suggestions:{', '.join(engagement_techniques)}")
        pd_cfg["dialogue_structure"] = st_tags(
            [], dialogue_structures, "Dialogue Structure"
        )
        st.caption(f"Suggestions:{', '.join(dialogue_structures)}")
        # PodcastConfig rejects word counts below 500, so the slider must not
        # offer them.
        pd_cfg["wordcount"] = st.slider(
            "Word Count", min_value=500, max_value=6000, step=50
        )
        pd_cfg["creativity"] = st.slider(
            "Creativity", min_value=0.0, max_value=1.0, step=0.05
        )
        pd_cfg["provider"] = st.selectbox("Provider", ["openai", "elevenlabs", "edge"])
        pd_cfg["voice1"] = st.text_input("Voice 1")
        pd_cfg["voice2"] = st.text_input("Voice 2")
        pd_cfg["model"] = st.text_input("Model")
        if st.button("Save"):
            try:
                new_config = PodcastConfig(**pd_cfg)
            except ValueError as e:
                # pydantic's ValidationError subclasses ValueError; show the
                # problem in the page instead of a raw traceback.
                st.error(f"Invalid template: {e}")
            else:
                new_config.save()
                st.success("Saved")

    # --- Existing templates: one expander per saved config, edited in place ---
    for pd_config in PodcastConfig.get_all():
        with st.expander(pd_config.name):
            # Widget keys include the config id so each expander's widgets
            # have distinct keys.
            pd_config.name = st.text_input(
                "Template Name", value=pd_config.name, key=f"name_{pd_config.id}"
            )
            pd_config.podcast_name = st.text_input(
                "Podcast Name",
                value=pd_config.podcast_name,
                key=f"podcast_name_{pd_config.id}",
            )
            pd_config.podcast_tagline = st.text_input(
                "Podcast Tagline",
                value=pd_config.podcast_tagline,
                key=f"podcast_tagline_{pd_config.id}",
            )
            pd_config.output_language = st.text_input(
                "Language",
                value=pd_config.output_language,
                key=f"output_language_{pd_config.id}",
            )
            pd_config.person1_role = st.text_input(
                "Person 1 role",
                value=pd_config.person1_role,
                key=f"person1_role_{pd_config.id}",
            )
            st.caption(f"Suggestions:{', '.join(participant_roles)}")
            pd_config.person2_role = st.text_input(
                "Person 2 role",
                value=pd_config.person2_role,
                key=f"person2_role_{pd_config.id}",
            )
            pd_config.conversation_style = st_tags(
                pd_config.conversation_style,
                conversation_styles,
                "Conversation Style",
                key=f"conversation_style_{pd_config.id}",
            )
            st.caption(f"Suggestions:{', '.join(conversation_styles)}")
            pd_config.engagement_technique = st_tags(
                pd_config.engagement_technique,
                engagement_techniques,
                "Engagement Techniques",
                key=f"engagement_technique_{pd_config.id}",
            )
            st.caption(f"Suggestions:{', '.join(engagement_techniques)}")
            pd_config.dialogue_structure = st_tags(
                pd_config.dialogue_structure,
                dialogue_structures,
                "Dialogue Structure",
                key=f"dialogue_structure_{pd_config.id}",
            )
            st.caption(f"Suggestions:{', '.join(dialogue_structures)}")
            # Same lower bound as the create form.
            pd_config.wordcount = st.slider(
                "Word Count",
                min_value=500,
                max_value=6000,
                step=50,
                value=pd_config.wordcount,
                key=f"wordcount_{pd_config.id}",
            )
            pd_config.creativity = st.slider(
                "Creativity",
                min_value=0.0,
                max_value=1.0,
                step=0.05,
                value=pd_config.creativity,
                key=f"creativity_{pd_config.id}",
            )
            pd_config.provider = st.selectbox(
                "Provider",
                ["openai", "elevenlabs", "edge"],
                index=["openai", "elevenlabs", "edge"].index(pd_config.provider),
                key=f"provider_{pd_config.id}",
            )
            pd_config.voice1 = st.text_input(
                "Voice 1", value=pd_config.voice1, key=f"voice1_{pd_config.id}"
            )
            pd_config.voice2 = st.text_input(
                "Voice 2", value=pd_config.voice2, key=f"voice2_{pd_config.id}"
            )
            pd_config.model = st.text_input(
                "Model", value=pd_config.model, key=f"model_{pd_config.id}"
            )
            # Rerun after either action so the list is rebuilt from get_all().
            if st.button("Save Config", key=f"btn_save{pd_config.id}"):
                pd_config.save()
                st.rerun()
            if st.button("Delete Config", key=f"btn_delete{pd_config.id}"):
                pd_config.delete()
                st.rerun()

View file

@ -3,6 +3,7 @@ from langchain_core.runnables import RunnableConfig
from open_notebook.domain import Note, Source
from open_notebook.graphs.chat import graph as chat_graph
from open_notebook.plugins.podcasts import PodcastConfig, PodcastEpisode
from open_notebook.utils import token_count
@ -52,38 +53,59 @@ def execute_chat(txt_input, session_id):
return result
# Fetch every saved podcast template at module level; the names populate the
# "Pick a template" selectbox in chat_sidebar.
podcast_configs = PodcastConfig.get_all()
podcast_config_names = [pd.name for pd in podcast_configs]
# TODO: if the token count is going to be used, it needs to be made configurable
# it would be good to show a total token count somewhere in the admin
# Render the sidebar for one session: a "Chat" tab (question input, token-count
# caption, message history with a save-as-note button on AI messages) and a
# "Podcast" tab (template picker plus a form that saves a PodcastEpisode).
# NOTE(review): the chat-input block and the message loop each appear twice
# below. This looks like the removed and added sides of a diff hunk merged
# without +/- markers; confirm against the repository before editing.
def chat_sidebar(session_id):
context = build_context(session_id=session_id)
# Token count covers the stringified context plus the stringified message list.
tokens = token_count(str(context) + str(st.session_state[session_id]["messages"]))
with st.container(border=True):
request = st.chat_input("Enter your question")
# removing for now since it's not multi-model capable right now
st.caption(f"Total tokens: {tokens}")
if request:
response = execute_chat(txt_input=request, session_id=session_id)
st.session_state[session_id]["messages"] = response["messages"]
chat_tab, podcast_tab = st.tabs(["Chat", "Podcast"])
with podcast_tab:
with st.container(border=True):
template = st.selectbox("Pick a template", podcast_config_names)
episode_name = st.text_input("Episode Name")
instructions = st.text_area("Instructions")
if st.button("Generate"):
epi = PodcastEpisode(
name=episode_name,
instructions=instructions,
template=template,
# NOTE(review): placeholder value; nothing visible here generates an audio file (TODO confirm)
file_path="lallaa",
)
epi.save()
st.page_link("pages/5_🎙_Podcasts.py", label="Go to Config")
st.divider()
with chat_tab:
with st.container(border=True):
request = st.chat_input("Enter your question")
# removing for now since it's not multi-model capable right now
st.caption(f"Total tokens: {tokens}")
if request:
response = execute_chat(txt_input=request, session_id=session_id)
st.session_state[session_id]["messages"] = response["messages"]
# Walk the stored messages in reverse order, skipping anything that is not a
# human/AI message or has empty content.
for msg in st.session_state[session_id]["messages"][::-1]:
if msg.type not in ["human", "ai"]:
continue
if not msg.content:
continue
for msg in st.session_state[session_id]["messages"][::-1]:
if msg.type not in ["human", "ai"]:
continue
if not msg.content:
continue
with st.chat_message(name=msg.type):
st.write(msg.content)
if msg.type == "ai":
# Saves the AI reply as a Note and attaches it to the session's notebook.
if st.button("💾 New Note", key=f"render_save_{msg.id}"):
title = "New Note"
content = msg.content
note = Note(
title=title,
content=content,
note_type="ai",
)
note.save()
note.add_to_notebook(
st.session_state[session_id]["notebook"].id
)
st.rerun()
with st.chat_message(name=msg.type):
st.write(msg.content)
if msg.type == "ai":
if st.button("💾 New Note", key=f"render_save_{msg.id}"):
title = "New Note"
content = msg.content
note = Note(
title=title,
content=content,
note_type="ai",
)
note.save()
note.add_to_notebook(
st.session_state[session_id]["notebook"].id
)
st.rerun()